//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-26218862
// Cuda compilation tools, release 10.1, V10.1.168
// Based on LLVM 3.4svn
//

.version 6.4
.target sm_75
.address_size 64

	// .globl	kernel_cuda_filter_copy_input
// _ZZ33kernel_filter_construct_transformPKfP8TileInfoii4int4iibPfPiifiiE15shared_features has been demoted
// _ZZ31kernel_filter_construct_gramianiiiiiiiiibPKfS0_PifPfP6float3iE17shared_design_row has been demoted

// kernel_cuda_filter_copy_input: each thread copies param_3 consecutive floats
// for one pixel (x, y) from a source buffer, selected through the descriptor
// at param_1, into the destination buffer at param_0.
//   param_0  destination base address (written with st.global.f32)
//   param_1  address of a descriptor in global memory. 32-bit fields at byte
//            offsets 76/80 are compared with x and those at 92/96 with y to
//            form an index 0..8; that index selects a 4-byte entry at +0, a
//            4-byte entry at +36 and an 8-byte buffer address at +176.
//            (presumably a tile-info structure -- confirm against the CUDA source)
//   param_2  four 32-bit ints {x0, y0, x1, y1}; threads with x >= x1 or
//            y >= y1 return without doing anything
//   param_3  number of floats copied per pixel; nothing is copied when < 1
.visible .entry kernel_cuda_filter_copy_input(
	.param .u64 kernel_cuda_filter_copy_input_param_0,
	.param .u64 kernel_cuda_filter_copy_input_param_1,
	.param .align 16 .b8 kernel_cuda_filter_copy_input_param_2[16],
	.param .u32 kernel_cuda_filter_copy_input_param_3
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<14>;
	.reg .f32 	%f<8>;
	.reg .b32 	%r<92>;
	.reg .b64 	%rd<52>;


	// Load parameters: %r16..%r19 = {x0, y0, x1, y1}, %r15 = floats per pixel.
	ld.param.u64 	%rd13, [kernel_cuda_filter_copy_input_param_0];
	ld.param.u64 	%rd14, [kernel_cuda_filter_copy_input_param_1];
	ld.param.v4.u32 	{%r16, %r17, %r18, %r19}, [kernel_cuda_filter_copy_input_param_2];
	ld.param.u32 	%r15, [kernel_cuda_filter_copy_input_param_3];
	// %r25 = x = ctaid.x * ntid.x + x0 + tid.x
	mov.u32 	%r21, %ctaid.x;
	mov.u32 	%r22, %ntid.x;
	mad.lo.s32 	%r23, %r21, %r22, %r16;
	mov.u32 	%r24, %tid.x;
	add.s32 	%r25, %r23, %r24;
	// %r29 = y = ctaid.y * ntid.y + y0 + tid.y
	mov.u32 	%r26, %ctaid.y;
	mov.u32 	%r27, %ntid.y;
	mad.lo.s32 	%r3, %r26, %r27, %r17;
	mov.u32 	%r28, %tid.y;
	add.s32 	%r29, %r3, %r28;
	// Threads outside the rectangle (x >= x1 or y >= y1) exit immediately.
	setp.ge.s32	%p1, %r25, %r18;
	setp.ge.s32	%p2, %r29, %r19;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB0_15;

	// Column part of the index (%rd49): 0 if x < [desc+76],
	// else 1 if x < [desc+80], else 2.
	cvta.to.global.u64 	%rd16, %rd14;
	add.s64 	%rd1, %rd16, 76;
	ld.global.u32 	%r35, [%rd16+76];
	setp.lt.s32	%p4, %r25, %r35;
	mov.u64 	%rd50, 0;
	mov.u64 	%rd49, %rd50;
	@%p4 bra 	BB0_3;

	ld.global.u32 	%r41, [%rd1+4];
	setp.lt.s32	%p5, %r25, %r41;
	selp.b64	%rd49, 1, 2, %p5;

BB0_3:
	// Row part of the index (%rd50): 0 if y < [desc+92],
	// else 3 if y < [desc+96], else 6.
	ld.global.u32 	%r44, [%rd1+16];
	setp.lt.s32	%p6, %r29, %r44;
	@%p6 bra 	BB0_5;

	ld.global.u32 	%r47, [%rd1+20];
	setp.lt.s32	%p7, %r29, %r47;
	selp.b64	%rd50, 3, 6, %p7;

BB0_5:
	// index = low 32 bits of (row part + column part).
	add.s64 	%rd18, %rd50, %rd49;
	and.b64  	%rd19, %rd18, 4294967295;
	// %rd6 = 64-bit source buffer address stored at desc + 176 + index*8.
	shl.b64 	%rd21, %rd19, 3;
	add.s64 	%rd22, %rd16, %rd21;
	ld.global.u64 	%rd6, [%rd22+176];
	// %r5 = 32-bit entry at desc + 36 + index*4 (multiplied with y below),
	// %r6 = 32-bit entry at desc + index*4 (added to x below).
	shl.b64 	%rd23, %rd19, 2;
	add.s64 	%rd24, %rd16, %rd23;
	ld.global.u32 	%r5, [%rd24+36];
	mul.lo.s32 	%r52, %r21, %r22;
	add.s32 	%r53, %r52, %r16;
	add.s32 	%r55, %r53, %r24;
	ld.global.u32 	%r6, [%rd24];
	// Source element offset: %rd7 = sext((%r5 * y + %r6 + x) * param_3).
	add.s32 	%r56, %r6, %r55;
	mad.lo.s32 	%r57, %r5, %r29, %r56;
	mul.lo.s32 	%r58, %r57, %r15;
	cvt.s64.s32	%rd7, %r58;
	// Destination element offset:
	// %r7 = ((y - y0) * (x1 - x0) + (x - x0)) * param_3.
	sub.s32 	%r59, %r29, %r17;
	sub.s32 	%r60, %r18, %r16;
	add.s32 	%r61, %r52, %r24;
	mad.lo.s32 	%r62, %r59, %r60, %r61;
	mul.lo.s32 	%r7, %r62, %r15;
	// Nothing to copy when param_3 < 1.
	setp.lt.s32	%p8, %r15, 1;
	@%p8 bra 	BB0_15;

	// The copy loop is unrolled by four. %r66 = param_3 & 3 leading elements
	// are copied one at a time first; %r88 counts the elements copied so far.
	and.b32  	%r66, %r15, 3;
	mov.u32 	%r88, 0;
	setp.eq.s32	%p9, %r66, 0;
	@%p9 bra 	BB0_12;

	setp.eq.s32	%p10, %r66, 1;
	@%p10 bra 	BB0_11;

	setp.eq.s32	%p11, %r66, 2;
	@%p11 bra 	BB0_10;

	// Remainder 3: copy element 0, then fall through BB0_10 and BB0_11.
	// Source reads use ld.f32 without a state space (generic addressing).
	shl.b64 	%rd25, %rd7, 2;
	add.s64 	%rd26, %rd6, %rd25;
	ld.f32 	%f1, [%rd26];
	cvta.to.global.u64 	%rd27, %rd13;
	mul.wide.s32 	%rd28, %r7, 4;
	add.s64 	%rd29, %rd27, %rd28;
	st.global.f32 	[%rd29], %f1;
	mov.u32 	%r88, 1;

BB0_10:
	// Copy element %r88 (second-to-last of the remainder), then advance.
	cvt.u64.u32	%rd30, %r88;
	add.s64 	%rd31, %rd30, %rd7;
	shl.b64 	%rd32, %rd31, 2;
	add.s64 	%rd33, %rd6, %rd32;
	ld.f32 	%f2, [%rd33];
	add.s32 	%r68, %r88, %r7;
	cvta.to.global.u64 	%rd34, %rd13;
	mul.wide.s32 	%rd35, %r68, 4;
	add.s64 	%rd36, %rd34, %rd35;
	st.global.f32 	[%rd36], %f2;
	add.s32 	%r88, %r88, 1;

BB0_11:
	// Copy element %r88 (last of the remainder), then advance.
	cvt.s64.s32	%rd37, %r88;
	add.s64 	%rd38, %rd37, %rd7;
	shl.b64 	%rd39, %rd38, 2;
	add.s64 	%rd40, %rd6, %rd39;
	ld.f32 	%f3, [%rd40];
	add.s32 	%r69, %r88, %r7;
	cvta.to.global.u64 	%rd41, %rd13;
	mul.wide.s32 	%rd42, %r69, 4;
	add.s64 	%rd43, %rd41, %rd42;
	st.global.f32 	[%rd43], %f3;
	add.s32 	%r88, %r88, 1;

BB0_12:
	// With fewer than four floats per pixel the remainder code did all the work.
	setp.lt.u32	%p12, %r15, 4;
	@%p12 bra 	BB0_15;

	// Set up the unrolled loop: %rd51 = byte offset of element %r88,
	// %rd9 = destination pixel base (same element offset as %r7),
	// %rd10 = source pixel base (same element offset as %r58).
	mul.wide.s32 	%rd51, %r88, 4;
	add.s32 	%r74, %r24, %r52;
	mad.lo.s32 	%r78, %r26, %r27, %r28;
	mad.lo.s32 	%r80, %r78, %r60, %r74;
	mul.lo.s32 	%r81, %r15, %r80;
	add.s32 	%r84, %r55, %r6;
	mad.lo.s32 	%r86, %r5, %r29, %r84;
	mul.lo.s32 	%r87, %r15, %r86;
	cvta.to.global.u64 	%rd44, %rd13;
	mul.wide.s32 	%rd45, %r81, 4;
	add.s64 	%rd9, %rd44, %rd45;
	mul.wide.s32 	%rd46, %r87, 4;
	add.s64 	%rd10, %rd6, %rd46;

BB0_14:
	// Copy four floats per iteration until %r88 reaches param_3.
	add.s64 	%rd47, %rd10, %rd51;
	ld.f32 	%f4, [%rd47];
	add.s64 	%rd48, %rd9, %rd51;
	st.global.f32 	[%rd48], %f4;
	ld.f32 	%f5, [%rd47+4];
	st.global.f32 	[%rd48+4], %f5;
	ld.f32 	%f6, [%rd47+8];
	st.global.f32 	[%rd48+8], %f6;
	ld.f32 	%f7, [%rd47+12];
	st.global.f32 	[%rd48+12], %f7;
	add.s64 	%rd51, %rd51, 16;
	add.s32 	%r88, %r88, 4;
	setp.lt.s32	%p13, %r88, %r15;
	@%p13 bra 	BB0_14;

BB0_15:
	ret;
}

	// .globl	kernel_cuda_filter_convert_to_rgb
// kernel_cuda_filter_convert_to_rgb: for pixel (x, y) read up to three
// 3-float inputs from the buffer at param_1, divide each component by
// float(param_8) and write them into consecutive planes of the buffer at
// param_0. Only the first input is clamped to [0, 10000].
//   param_0  destination base address (global)
//   param_1  source base address (global)
//   param_2  x bound (threads with x >= param_2 exit); also the row length of
//            the destination planes
//   param_3  y bound (threads with y >= param_3 exit)
//   param_4  multiplied with y when forming the source offset
//   param_5  multiplied with x when forming the source offset
//   param_6  three 32-bit ints, one per input; each is added to y * param_4
//            and the sum is shifted right by 2 before being used as a float
//            index (presumably byte offsets converted to float counts -- confirm)
//   param_7  number of inputs to convert (sections run when >= 1, >= 2, >= 3)
//   param_8  integer divisor, converted to float
.visible .entry kernel_cuda_filter_convert_to_rgb(
	.param .u64 kernel_cuda_filter_convert_to_rgb_param_0,
	.param .u64 kernel_cuda_filter_convert_to_rgb_param_1,
	.param .u32 kernel_cuda_filter_convert_to_rgb_param_2,
	.param .u32 kernel_cuda_filter_convert_to_rgb_param_3,
	.param .u32 kernel_cuda_filter_convert_to_rgb_param_4,
	.param .u32 kernel_cuda_filter_convert_to_rgb_param_5,
	.param .align 4 .b8 kernel_cuda_filter_convert_to_rgb_param_6[12],
	.param .u32 kernel_cuda_filter_convert_to_rgb_param_7,
	.param .u32 kernel_cuda_filter_convert_to_rgb_param_8
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<7>;
	.reg .f32 	%f<30>;
	.reg .b32 	%r<35>;
	.reg .b64 	%rd<35>;


	// Load parameters; %r1, %r9, %r10 are the three entries of param_6.
	ld.param.u64 	%rd3, [kernel_cuda_filter_convert_to_rgb_param_0];
	ld.param.u64 	%rd4, [kernel_cuda_filter_convert_to_rgb_param_1];
	ld.param.u32 	%r4, [kernel_cuda_filter_convert_to_rgb_param_2];
	ld.param.u32 	%r5, [kernel_cuda_filter_convert_to_rgb_param_3];
	ld.param.u32 	%r6, [kernel_cuda_filter_convert_to_rgb_param_4];
	ld.param.u32 	%r7, [kernel_cuda_filter_convert_to_rgb_param_5];
	ld.param.u32 	%r10, [kernel_cuda_filter_convert_to_rgb_param_6+8];
	ld.param.u32 	%r9, [kernel_cuda_filter_convert_to_rgb_param_6+4];
	ld.param.u32 	%r1, [kernel_cuda_filter_convert_to_rgb_param_6];
	ld.param.u32 	%r11, [kernel_cuda_filter_convert_to_rgb_param_7];
	ld.param.u32 	%r12, [kernel_cuda_filter_convert_to_rgb_param_8];
	// %rd1 = destination, %rd2 = source, both as global-space addresses.
	cvta.to.global.u64 	%rd1, %rd3;
	cvta.to.global.u64 	%rd2, %rd4;
	// %r2 = x = ctaid.x * ntid.x + tid.x, %r3 = y = ctaid.y * ntid.y + tid.y
	mov.u32 	%r13, %ctaid.x;
	mov.u32 	%r14, %ntid.x;
	mov.u32 	%r15, %tid.x;
	mad.lo.s32 	%r2, %r13, %r14, %r15;
	mov.u32 	%r16, %ctaid.y;
	mov.u32 	%r17, %ntid.y;
	mov.u32 	%r18, %tid.y;
	mad.lo.s32 	%r3, %r16, %r17, %r18;
	// Exit when x >= param_2 or y >= param_3.
	setp.ge.s32	%p1, %r2, %r4;
	setp.ge.s32	%p2, %r3, %r5;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB1_7;

	// ---- Input 0 (skipped when param_7 < 1) ----
	setp.lt.s32	%p4, %r11, 1;
	@%p4 bra 	BB1_3;

	// Source float index = ((y * param_4 + param_6[0]) >> 2) + x * param_5.
	// The shift is a logical 64-bit shift of the sign-extended 32-bit sum.
	mul.lo.s32 	%r19, %r2, %r7;
	cvt.s64.s32	%rd5, %r19;
	mad.lo.s32 	%r20, %r3, %r6, %r1;
	cvt.s64.s32	%rd6, %r20;
	shr.u64 	%rd7, %rd6, 2;
	add.s64 	%rd8, %rd7, %rd5;
	shl.b64 	%rd9, %rd8, 2;
	add.s64 	%rd10, %rd2, %rd9;
	// Destination float index = 3 * (y * param_2 + x).
	mad.lo.s32 	%r21, %r3, %r4, %r2;
	mul.lo.s32 	%r22, %r21, 3;
	mul.wide.s32 	%rd11, %r22, 4;
	add.s64 	%rd12, %rd1, %rd11;
	// Each component: divide by float(param_8), clamp to [0, 10000]
	// (0f461C4000 is 10000.0), store.
	cvt.rn.f32.s32	%f1, %r12;
	ld.global.f32 	%f2, [%rd10];
	div.approx.ftz.f32 	%f3, %f2, %f1;
	mov.f32 	%f4, 0f00000000;
	max.ftz.f32 	%f5, %f3, %f4;
	mov.f32 	%f6, 0f461C4000;
	min.ftz.f32 	%f7, %f5, %f6;
	st.global.f32 	[%rd12], %f7;
	ld.global.f32 	%f8, [%rd10+4];
	div.approx.ftz.f32 	%f9, %f8, %f1;
	max.ftz.f32 	%f10, %f9, %f4;
	min.ftz.f32 	%f11, %f10, %f6;
	st.global.f32 	[%rd12+4], %f11;
	ld.global.f32 	%f12, [%rd10+8];
	div.approx.ftz.f32 	%f13, %f12, %f1;
	max.ftz.f32 	%f14, %f13, %f4;
	min.ftz.f32 	%f15, %f14, %f6;
	st.global.f32 	[%rd12+8], %f15;

BB1_3:
	// ---- Input 1 (skipped when param_7 < 2) ----
	setp.lt.s32	%p5, %r11, 2;
	@%p5 bra 	BB1_5;

	// Source float index = ((y * param_4 + param_6[1]) >> 2) + x * param_5.
	mul.lo.s32 	%r23, %r2, %r7;
	cvt.s64.s32	%rd13, %r23;
	mad.lo.s32 	%r24, %r3, %r6, %r9;
	cvt.s64.s32	%rd14, %r24;
	shr.u64 	%rd15, %rd14, 2;
	add.s64 	%rd16, %rd15, %rd13;
	shl.b64 	%rd17, %rd16, 2;
	add.s64 	%rd18, %rd2, %rd17;
	// Destination float index = 3 * (y * param_2 + x) + 3 * param_2 * param_3,
	// i.e. one plane of param_2 * param_3 three-float pixels further on.
	mad.lo.s32 	%r25, %r3, %r4, %r2;
	mul.lo.s32 	%r26, %r25, 3;
	cvt.s64.s32	%rd19, %r26;
	mul.lo.s32 	%r27, %r4, %r5;
	mul.lo.s32 	%r28, %r27, 3;
	cvt.s64.s32	%rd20, %r28;
	add.s64 	%rd21, %rd19, %rd20;
	shl.b64 	%rd22, %rd21, 2;
	add.s64 	%rd23, %rd1, %rd22;
	// Divide by float(param_8) and store; no clamping for this input.
	cvt.rn.f32.s32	%f16, %r12;
	ld.global.f32 	%f17, [%rd18];
	div.approx.ftz.f32 	%f18, %f17, %f16;
	st.global.f32 	[%rd23], %f18;
	ld.global.f32 	%f19, [%rd18+4];
	div.approx.ftz.f32 	%f20, %f19, %f16;
	st.global.f32 	[%rd23+4], %f20;
	ld.global.f32 	%f21, [%rd18+8];
	div.approx.ftz.f32 	%f22, %f21, %f16;
	st.global.f32 	[%rd23+8], %f22;

BB1_5:
	// ---- Input 2 (skipped when param_7 < 3) ----
	setp.lt.s32	%p6, %r11, 3;
	@%p6 bra 	BB1_7;

	// Source float index = ((y * param_4 + param_6[2]) >> 2) + x * param_5.
	mul.lo.s32 	%r29, %r2, %r7;
	cvt.s64.s32	%rd24, %r29;
	mad.lo.s32 	%r30, %r3, %r6, %r10;
	cvt.s64.s32	%rd25, %r30;
	shr.u64 	%rd26, %rd25, 2;
	add.s64 	%rd27, %rd26, %rd24;
	shl.b64 	%rd28, %rd27, 2;
	add.s64 	%rd29, %rd2, %rd28;
	// Destination float index = 3 * (y * param_2 + x) + 6 * param_2 * param_3
	// (two planes further on).
	mad.lo.s32 	%r31, %r3, %r4, %r2;
	mul.lo.s32 	%r32, %r31, 3;
	cvt.s64.s32	%rd30, %r32;
	mul.lo.s32 	%r33, %r4, %r5;
	mul.lo.s32 	%r34, %r33, 6;
	cvt.s64.s32	%rd31, %r34;
	add.s64 	%rd32, %rd30, %rd31;
	shl.b64 	%rd33, %rd32, 2;
	add.s64 	%rd34, %rd1, %rd33;
	// Divide by float(param_8) and store; no clamping for this input.
	cvt.rn.f32.s32	%f23, %r12;
	ld.global.f32 	%f24, [%rd29];
	div.approx.ftz.f32 	%f25, %f24, %f23;
	st.global.f32 	[%rd34], %f25;
	ld.global.f32 	%f26, [%rd29+4];
	div.approx.ftz.f32 	%f27, %f26, %f23;
	st.global.f32 	[%rd34+4], %f27;
	ld.global.f32 	%f28, [%rd29+8];
	div.approx.ftz.f32 	%f29, %f28, %f23;
	st.global.f32 	[%rd34+8], %f29;

BB1_7:
	ret;
}

	// .globl	kernel_cuda_filter_convert_from_rgb
// kernel_cuda_filter_convert_from_rgb: each thread reads three consecutive
// floats for pixel (x, y) from the buffer at param_0, multiplies them by
// float(param_13) and writes them to the buffer at param_1.
//   input float index  = 3 * ((y + param_3) * param_4 + (x + param_2))
//   output float index = ((y + param_7) * param_11 + (param_10 + param_6 + x)) * param_12
// Threads with x >= param_8 or y >= param_9 do nothing. param_5 is not read.
.visible .entry kernel_cuda_filter_convert_from_rgb(
	.param .u64 kernel_cuda_filter_convert_from_rgb_param_0,
	.param .u64 kernel_cuda_filter_convert_from_rgb_param_1,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_2,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_3,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_4,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_5,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_6,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_7,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_8,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_9,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_10,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_11,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_12,
	.param .u32 kernel_cuda_filter_convert_from_rgb_param_13
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%x_oob, %y_oob, %oob;
	.reg .f32 	%gain, %in0, %in1, %in2, %out0, %out1, %out2;
	.reg .b32 	%blk_x, %dim_x, %lane_x, %px;
	.reg .b32 	%blk_y, %dim_y, %lane_y, %py;
	.reg .b32 	%lim_x, %lim_y, %gain_i;
	.reg .b32 	%in_dx, %in_dy, %in_pitch, %in_col, %in_row, %in_pix, %in_elem;
	.reg .b32 	%out_dx, %out_dy, %out_x0, %out_pitch, %out_step;
	.reg .b32 	%out_row, %out_base, %out_col, %out_pix, %out_elem;
	.reg .b64 	%in_gen, %in_glob, %in_off, %in_ptr;
	.reg .b64 	%out_gen, %out_glob, %out_off, %out_ptr;

	// Pixel coordinates of this thread: block index * block size + thread index.
	mov.u32 	%blk_x, %ctaid.x;
	mov.u32 	%dim_x, %ntid.x;
	mov.u32 	%lane_x, %tid.x;
	mad.lo.s32 	%px, %blk_x, %dim_x, %lane_x;
	mov.u32 	%blk_y, %ctaid.y;
	mov.u32 	%dim_y, %ntid.y;
	mov.u32 	%lane_y, %tid.y;
	mad.lo.s32 	%py, %blk_y, %dim_y, %lane_y;

	// Leave early when the pixel lies outside the param_8 x param_9 area.
	ld.param.u32 	%lim_x, [kernel_cuda_filter_convert_from_rgb_param_8];
	ld.param.u32 	%lim_y, [kernel_cuda_filter_convert_from_rgb_param_9];
	setp.ge.s32	%x_oob, %px, %lim_x;
	setp.ge.s32	%y_oob, %py, %lim_y;
	or.pred  	%oob, %x_oob, %y_oob;
	@%oob bra 	FROM_RGB_DONE;

	// Input address: param_0 + 4 * 3 * ((y + param_3) * param_4 + (x + param_2)).
	ld.param.u64 	%in_gen, [kernel_cuda_filter_convert_from_rgb_param_0];
	cvta.to.global.u64 	%in_glob, %in_gen;
	ld.param.u32 	%in_dx, [kernel_cuda_filter_convert_from_rgb_param_2];
	ld.param.u32 	%in_dy, [kernel_cuda_filter_convert_from_rgb_param_3];
	ld.param.u32 	%in_pitch, [kernel_cuda_filter_convert_from_rgb_param_4];
	add.s32 	%in_col, %px, %in_dx;
	add.s32 	%in_row, %py, %in_dy;
	mad.lo.s32 	%in_pix, %in_row, %in_pitch, %in_col;
	mul.lo.s32 	%in_elem, %in_pix, 3;
	mul.wide.s32 	%in_off, %in_elem, 4;
	add.s64 	%in_ptr, %in_glob, %in_off;

	// Output address:
	// param_1 + 4 * ((y + param_7) * param_11 + (param_10 + param_6 + x)) * param_12.
	ld.param.u64 	%out_gen, [kernel_cuda_filter_convert_from_rgb_param_1];
	cvta.to.global.u64 	%out_glob, %out_gen;
	ld.param.u32 	%out_dx, [kernel_cuda_filter_convert_from_rgb_param_6];
	ld.param.u32 	%out_dy, [kernel_cuda_filter_convert_from_rgb_param_7];
	ld.param.u32 	%out_x0, [kernel_cuda_filter_convert_from_rgb_param_10];
	ld.param.u32 	%out_pitch, [kernel_cuda_filter_convert_from_rgb_param_11];
	ld.param.u32 	%out_step, [kernel_cuda_filter_convert_from_rgb_param_12];
	add.s32 	%out_row, %py, %out_dy;
	add.s32 	%out_base, %out_x0, %out_dx;
	add.s32 	%out_col, %out_base, %px;
	mad.lo.s32 	%out_pix, %out_row, %out_pitch, %out_col;
	mul.lo.s32 	%out_elem, %out_pix, %out_step;
	mul.wide.s32 	%out_off, %out_elem, 4;
	add.s64 	%out_ptr, %out_glob, %out_off;

	// Scale factor is the integer param_13 converted to float.
	ld.param.u32 	%gain_i, [kernel_cuda_filter_convert_from_rgb_param_13];
	cvt.rn.f32.s32	%gain, %gain_i;

	// Scale and store the three components, one load/store pair at a time.
	ld.global.f32 	%in0, [%in_ptr];
	mul.ftz.f32 	%out0, %gain, %in0;
	st.global.f32 	[%out_ptr], %out0;
	ld.global.f32 	%in1, [%in_ptr+4];
	mul.ftz.f32 	%out1, %gain, %in1;
	st.global.f32 	[%out_ptr+4], %out1;
	ld.global.f32 	%in2, [%in_ptr+8];
	mul.ftz.f32 	%out2, %gain, %in2;
	st.global.f32 	[%out_ptr+8], %out2;

FROM_RGB_DONE:
	ret;
}

	// .globl	kernel_cuda_filter_divide_shadow
// kernel_cuda_filter_divide_shadow: for pixel (x, y) read six floats from a
// source buffer selected through the descriptor at param_1 and write five
// derived per-pixel values, one into each of the buffers param_2..param_6.
//   param_0  integer n used as a count in the normalisation below
//            (presumably the sample count -- confirm)
//   param_1  address of a descriptor in global memory; fields at byte offsets
//            76/80 (vs x) and 92/96 (vs y) give an index 0..8 selecting a
//            4-byte entry at +0, a 4-byte entry at +36 and an 8-byte buffer
//            address at +176
//   param_2  output: ratio A = in[+60] / max(in[+56], 1e-7)
//   param_3  output: ratio B = in[+72] / max(in[+68], 1e-7)
//   param_4  output: 0.5 * (varA + varB) / n
//   param_5  output: 0.5 * (varA - varB)^2 / (n * n)
//   param_6  output: 0.5 * (A - B)^2
//   param_7  four 32-bit ints {x0, y0, x1, y1}; threads outside return
//   param_8  multiplied with the source pixel index
//   param_9  element offset added to the scaled source pixel index
// where varA = max(0, in[+64] - ((n+1)/2) * A^2) / max((n+1)/2 - 1, 1)
//       varB = max(0, in[+76] - (n/2) * B^2)     / max(n/2 - 1, 1)
// and the "+k" values are byte offsets from the pixel's source address.
.visible .entry kernel_cuda_filter_divide_shadow(
	.param .u32 kernel_cuda_filter_divide_shadow_param_0,
	.param .u64 kernel_cuda_filter_divide_shadow_param_1,
	.param .u64 kernel_cuda_filter_divide_shadow_param_2,
	.param .u64 kernel_cuda_filter_divide_shadow_param_3,
	.param .u64 kernel_cuda_filter_divide_shadow_param_4,
	.param .u64 kernel_cuda_filter_divide_shadow_param_5,
	.param .u64 kernel_cuda_filter_divide_shadow_param_6,
	.param .align 16 .b8 kernel_cuda_filter_divide_shadow_param_7[16],
	.param .u32 kernel_cuda_filter_divide_shadow_param_8,
	.param .u32 kernel_cuda_filter_divide_shadow_param_9
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<8>;
	.reg .f32 	%f<42>;
	.reg .b32 	%r<82>;
	.reg .b64 	%rd<41>;


	// Load parameters: %r4 = n, %r7..%r10 = {x0, y0, x1, y1}.
	ld.param.u32 	%r4, [kernel_cuda_filter_divide_shadow_param_0];
	ld.param.u64 	%rd6, [kernel_cuda_filter_divide_shadow_param_1];
	ld.param.u64 	%rd7, [kernel_cuda_filter_divide_shadow_param_2];
	ld.param.u64 	%rd8, [kernel_cuda_filter_divide_shadow_param_3];
	ld.param.u64 	%rd9, [kernel_cuda_filter_divide_shadow_param_4];
	ld.param.u64 	%rd10, [kernel_cuda_filter_divide_shadow_param_5];
	ld.param.u64 	%rd11, [kernel_cuda_filter_divide_shadow_param_6];
	ld.param.v4.u32 	{%r7, %r8, %r9, %r10}, [kernel_cuda_filter_divide_shadow_param_7];
	ld.param.u32 	%r5, [kernel_cuda_filter_divide_shadow_param_8];
	ld.param.u32 	%r6, [kernel_cuda_filter_divide_shadow_param_9];
	// %r16 = x = ctaid.x * ntid.x + x0 + tid.x
	mov.u32 	%r12, %ctaid.x;
	mov.u32 	%r13, %ntid.x;
	mad.lo.s32 	%r14, %r12, %r13, %r7;
	mov.u32 	%r15, %tid.x;
	add.s32 	%r16, %r14, %r15;
	// %r21 = y = ctaid.y * ntid.y + y0 + tid.y
	mov.u32 	%r17, %ctaid.y;
	mov.u32 	%r18, %ntid.y;
	mad.lo.s32 	%r19, %r17, %r18, %r8;
	mov.u32 	%r20, %tid.y;
	add.s32 	%r21, %r19, %r20;
	// Exit when x >= x1 or y >= y1.
	setp.ge.s32	%p1, %r16, %r9;
	setp.ge.s32	%p2, %r21, %r10;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB3_6;

	// Column part of the index (%rd39): 0 if x < [desc+76],
	// else 1 if x < [desc+80], else 2.
	cvta.to.global.u64 	%rd13, %rd6;
	add.s64 	%rd1, %rd13, 76;
	ld.global.u32 	%r27, [%rd13+76];
	setp.lt.s32	%p4, %r16, %r27;
	mov.u64 	%rd40, 0;
	mov.u64 	%rd39, %rd40;
	@%p4 bra 	BB3_3;

	ld.global.u32 	%r33, [%rd1+4];
	setp.lt.s32	%p5, %r16, %r33;
	selp.b64	%rd39, 1, 2, %p5;

BB3_3:
	// Row part of the index (%rd40): 0 if y < [desc+92],
	// else 3 if y < [desc+96], else 6.
	ld.global.u32 	%r39, [%rd1+16];
	setp.lt.s32	%p6, %r21, %r39;
	@%p6 bra 	BB3_5;

	ld.global.u32 	%r45, [%rd1+20];
	setp.lt.s32	%p7, %r21, %r45;
	selp.b64	%rd40, 3, 6, %p7;

BB3_5:
	// index = low 32 bits of (row part + column part); fetch the buffer
	// address (%rd22, at +176 + 8*index), %r51 (at +36 + 4*index) and
	// %r58 (at +4*index).
	add.s64 	%rd15, %rd40, %rd39;
	and.b64  	%rd16, %rd15, 4294967295;
	shl.b64 	%rd18, %rd16, 2;
	add.s64 	%rd19, %rd13, %rd18;
	shl.b64 	%rd20, %rd16, 3;
	add.s64 	%rd21, %rd13, %rd20;
	ld.global.u64 	%rd22, [%rd21+176];
	ld.global.u32 	%r51, [%rd19+36];
	mul.lo.s32 	%r54, %r12, %r13;
	add.s32 	%r55, %r54, %r7;
	add.s32 	%r57, %r55, %r15;
	ld.global.u32 	%r58, [%rd19];
	// Source element offset = (%r51 * y + %r58 + x) * param_8 + param_9.
	add.s32 	%r59, %r58, %r57;
	mad.lo.s32 	%r60, %r51, %r21, %r59;
	mul.lo.s32 	%r61, %r60, %r5;
	cvt.s64.s32	%rd23, %r61;
	cvt.s64.s32	%rd24, %r6;
	// Output index %r68 = (y - y0) * w + (x - x0), where
	// w = (x1 - x0 + 3) & -4, i.e. x1 - x0 rounded up to a multiple of four.
	mov.u32 	%r62, 3;
	sub.s32 	%r63, %r62, %r7;
	add.s32 	%r64, %r63, %r9;
	and.b32  	%r65, %r64, -4;
	sub.s32 	%r66, %r21, %r8;
	add.s32 	%r67, %r54, %r15;
	mad.lo.s32 	%r68, %r66, %r65, %r67;
	// %rd27 = address of the pixel's source data (read with generic ld.f32).
	add.s64 	%rd25, %rd23, %rd24;
	shl.b64 	%rd26, %rd25, 2;
	add.s64 	%rd27, %rd26, %rd22;
	// A = in[+60] / max(in[+56], 1e-7)   (0f33D6BF95 is 1e-7) -> param_2[idx]
	ld.f32 	%f1, [%rd27+56];
	mov.f32 	%f2, 0f33D6BF95;
	max.ftz.f32 	%f3, %f1, %f2;
	ld.f32 	%f4, [%rd27+60];
	div.approx.ftz.f32 	%f5, %f4, %f3;
	cvta.to.global.u64 	%rd28, %rd7;
	mul.wide.s32 	%rd29, %r68, 4;
	add.s64 	%rd30, %rd28, %rd29;
	st.global.f32 	[%rd30], %f5;
	// B = in[+72] / max(in[+68], 1e-7) -> param_3[idx]
	ld.f32 	%f6, [%rd27+68];
	max.ftz.f32 	%f7, %f6, %f2;
	ld.f32 	%f8, [%rd27+72];
	div.approx.ftz.f32 	%f9, %f8, %f7;
	cvta.to.global.u64 	%rd31, %rd8;
	add.s64 	%rd32, %rd31, %rd29;
	st.global.f32 	[%rd32], %f9;
	// %r73 = (n + 1) / 2 as a signed division (sign bit added before the shift).
	mov.u32 	%r69, 1;
	add.s32 	%r70, %r4, 1;
	shr.u32 	%r71, %r70, 31;
	add.s32 	%r72, %r70, %r71;
	shr.s32 	%r73, %r72, 1;
	// A is re-read from param_2[idx] here rather than reusing %f5.
	// %f17 = max(0, in[+64] - ((n+1)/2) * A^2)
	ld.global.f32 	%f10, [%rd30];
	mul.ftz.f32 	%f11, %f10, %f10;
	cvt.rn.f32.s32	%f12, %r73;
	mul.ftz.f32 	%f13, %f12, %f11;
	ld.f32 	%f14, [%rd27+64];
	sub.ftz.f32 	%f15, %f14, %f13;
	mov.f32 	%f16, 0f00000000;
	max.ftz.f32 	%f17, %f16, %f15;
	// %r76 = n / 2 (signed); %f23 = max(0, in[+76] - (n/2) * B^2)
	mul.ftz.f32 	%f18, %f9, %f9;
	shr.u32 	%r74, %r4, 31;
	add.s32 	%r75, %r4, %r74;
	shr.s32 	%r76, %r75, 1;
	cvt.rn.f32.s32	%f19, %r76;
	mul.ftz.f32 	%f20, %f19, %f18;
	ld.f32 	%f21, [%rd27+76];
	sub.ftz.f32 	%f22, %f21, %f20;
	max.ftz.f32 	%f23, %f16, %f22;
	// varA (%f25) = %f17 / max((n+1)/2 - 1, 1)
	add.s32 	%r77, %r73, -1;
	max.s32 	%r78, %r77, %r69;
	cvt.rn.f32.s32	%f24, %r78;
	div.approx.ftz.f32 	%f25, %f17, %f24;
	// varB (%f27) = %f23 / max(n/2 - 1, 1)
	add.s32 	%r79, %r76, -1;
	max.s32 	%r80, %r79, %r69;
	cvt.rn.f32.s32	%f26, %r80;
	div.approx.ftz.f32 	%f27, %f23, %f26;
	// param_4[idx] = 0.5 * (varA + varB) / n   (0f3F000000 is 0.5)
	add.ftz.f32 	%f28, %f25, %f27;
	mul.ftz.f32 	%f29, %f28, 0f3F000000;
	cvt.rn.f32.s32	%f30, %r4;
	div.approx.ftz.f32 	%f31, %f29, %f30;
	cvta.to.global.u64 	%rd33, %rd9;
	add.s64 	%rd34, %rd33, %rd29;
	st.global.f32 	[%rd34], %f31;
	// param_5[idx] = 0.5 * (varA - varB)^2 / (n * n)
	sub.ftz.f32 	%f32, %f25, %f27;
	mul.ftz.f32 	%f33, %f32, 0f3F000000;
	mul.ftz.f32 	%f34, %f32, %f33;
	mul.lo.s32 	%r81, %r4, %r4;
	cvt.rn.f32.s32	%f35, %r81;
	div.approx.ftz.f32 	%f36, %f34, %f35;
	cvta.to.global.u64 	%rd35, %rd10;
	add.s64 	%rd36, %rd35, %rd29;
	st.global.f32 	[%rd36], %f36;
	// param_6[idx] = 0.5 * (A - B)^2, with A and B re-read from
	// param_2[idx] and param_3[idx].
	ld.global.f32 	%f37, [%rd32];
	ld.global.f32 	%f38, [%rd30];
	sub.ftz.f32 	%f39, %f38, %f37;
	mul.ftz.f32 	%f40, %f39, 0f3F000000;
	mul.ftz.f32 	%f41, %f39, %f40;
	cvta.to.global.u64 	%rd37, %rd11;
	add.s64 	%rd38, %rd37, %rd29;
	st.global.f32 	[%rd38], %f41;

BB3_6:
	ret;
}

	// .globl	kernel_cuda_filter_get_feature
// kernel_cuda_filter_get_feature: for pixel (x, y) read one float from a
// source buffer selected through the descriptor at param_1, scale it and
// store it in param_4; optionally derive a second value and store it in
// param_5.
//   param_0  integer n (presumably the sample count -- confirm)
//   param_1  address of a descriptor in global memory; fields at byte offsets
//            76/80 (vs x) and 92/96 (vs y) give an index 0..8 selecting a
//            4-byte entry at +0, a 4-byte entry at +36 and an 8-byte buffer
//            address at +176
//   param_2  element offset of the first value within the pixel's source data
//   param_3  element offset of the second value; when negative the second
//            output is not written at all
//   param_4  first output:  m = in[param_2] * param_6
//   param_5  second output: max(0, (in[param_3] - n * m^2) / (n * (n - 1)))
//            when n > 1, otherwise the bit pattern 0x501502F9 (1.0e10f)
//   param_6  float scale applied to the first value
//   param_7  four 32-bit ints {x0, y0, x1, y1}; threads outside return
//   param_8  multiplied with the source pixel index
//   param_9  element offset added to the scaled source pixel index
.visible .entry kernel_cuda_filter_get_feature(
	.param .u32 kernel_cuda_filter_get_feature_param_0,
	.param .u64 kernel_cuda_filter_get_feature_param_1,
	.param .u32 kernel_cuda_filter_get_feature_param_2,
	.param .u32 kernel_cuda_filter_get_feature_param_3,
	.param .u64 kernel_cuda_filter_get_feature_param_4,
	.param .u64 kernel_cuda_filter_get_feature_param_5,
	.param .f32 kernel_cuda_filter_get_feature_param_6,
	.param .align 16 .b8 kernel_cuda_filter_get_feature_param_7[16],
	.param .u32 kernel_cuda_filter_get_feature_param_8,
	.param .u32 kernel_cuda_filter_get_feature_param_9
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<10>;
	.reg .f32 	%f<13>;
	.reg .b32 	%r<45>;
	.reg .b64 	%rd<39>;


	// Load parameters: %r8 = n, %r13..%r16 = {x0, y0, x1, y1}.
	ld.param.u32 	%r8, [kernel_cuda_filter_get_feature_param_0];
	ld.param.u64 	%rd9, [kernel_cuda_filter_get_feature_param_1];
	ld.param.u32 	%r9, [kernel_cuda_filter_get_feature_param_2];
	ld.param.u32 	%r10, [kernel_cuda_filter_get_feature_param_3];
	ld.param.u64 	%rd10, [kernel_cuda_filter_get_feature_param_4];
	ld.param.u64 	%rd11, [kernel_cuda_filter_get_feature_param_5];
	ld.param.f32 	%f2, [kernel_cuda_filter_get_feature_param_6];
	ld.param.v4.u32 	{%r13, %r14, %r15, %r16}, [kernel_cuda_filter_get_feature_param_7];
	ld.param.u32 	%r11, [kernel_cuda_filter_get_feature_param_8];
	ld.param.u32 	%r12, [kernel_cuda_filter_get_feature_param_9];
	// %r1 = ctaid.x * ntid.x, %r3 = x = %r1 + x0 + tid.x
	mov.u32 	%r18, %ctaid.x;
	mov.u32 	%r19, %ntid.x;
	mul.lo.s32 	%r1, %r18, %r19;
	add.s32 	%r20, %r1, %r13;
	mov.u32 	%r21, %tid.x;
	add.s32 	%r3, %r20, %r21;
	// %r5 = y = ctaid.y * ntid.y + y0 + tid.y
	mov.u32 	%r22, %ctaid.y;
	mov.u32 	%r23, %ntid.y;
	mad.lo.s32 	%r24, %r22, %r23, %r14;
	mov.u32 	%r25, %tid.y;
	add.s32 	%r5, %r24, %r25;
	// Exit when x >= x1 or y >= y1.
	setp.ge.s32	%p1, %r3, %r15;
	setp.ge.s32	%p2, %r5, %r16;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB4_9;

	// Column part of the index (%rd37): 0 if x < [desc+76],
	// else 1 if x < [desc+80], else 2.
	cvta.to.global.u64 	%rd13, %rd9;
	add.s64 	%rd1, %rd13, 76;
	ld.global.u32 	%r26, [%rd13+76];
	setp.lt.s32	%p4, %r3, %r26;
	mov.u64 	%rd38, 0;
	mov.u64 	%rd37, %rd38;
	@%p4 bra 	BB4_3;

	ld.global.u32 	%r27, [%rd1+4];
	setp.lt.s32	%p5, %r3, %r27;
	selp.b64	%rd37, 1, 2, %p5;

BB4_3:
	// Row part of the index (%rd38): 0 if y < [desc+92],
	// else 3 if y < [desc+96], else 6.
	ld.global.u32 	%r28, [%rd1+16];
	setp.lt.s32	%p6, %r5, %r28;
	@%p6 bra 	BB4_5;

	ld.global.u32 	%r29, [%rd1+20];
	setp.lt.s32	%p7, %r5, %r29;
	selp.b64	%rd38, 3, 6, %p7;

BB4_5:
	// index = low 32 bits of (row part + column part); %rd6 = buffer address
	// at desc + 176 + 8*index, %r30 = entry at +36 + 4*index, %r31 = entry
	// at +4*index.
	add.s64 	%rd15, %rd38, %rd37;
	and.b64  	%rd16, %rd15, 4294967295;
	shl.b64 	%rd18, %rd16, 3;
	add.s64 	%rd19, %rd13, %rd18;
	ld.global.u64 	%rd6, [%rd19+176];
	shl.b64 	%rd20, %rd16, 2;
	add.s64 	%rd21, %rd13, %rd20;
	ld.global.u32 	%r30, [%rd21+36];
	ld.global.u32 	%r31, [%rd21];
	// %rd7 = (%r30 * y + %r31 + x) * param_8 + param_9: element offset of
	// the pixel's source data.
	add.s32 	%r32, %r31, %r3;
	mad.lo.s32 	%r33, %r30, %r5, %r32;
	mul.lo.s32 	%r34, %r33, %r11;
	cvt.s64.s32	%rd22, %r34;
	cvt.s64.s32	%rd23, %r12;
	add.s64 	%rd7, %rd22, %rd23;
	// Output index %r7 = (y - y0) * w + (x - x0), where
	// w = (x1 - x0 + 3) & -4, i.e. x1 - x0 rounded up to a multiple of four.
	mov.u32 	%r35, 3;
	sub.s32 	%r36, %r35, %r13;
	add.s32 	%r37, %r36, %r15;
	and.b32  	%r38, %r37, -4;
	sub.s32 	%r39, %r5, %r14;
	add.s32 	%r41, %r1, %r21;
	mad.lo.s32 	%r7, %r39, %r38, %r41;
	// m (%f1) = in[param_2] * param_6, stored to param_4[idx].
	// The source read uses ld.f32 without a state space (generic addressing).
	cvt.s64.s32	%rd24, %r9;
	add.s64 	%rd25, %rd7, %rd24;
	shl.b64 	%rd26, %rd25, 2;
	add.s64 	%rd27, %rd6, %rd26;
	ld.f32 	%f3, [%rd27];
	mul.ftz.f32 	%f1, %f3, %f2;
	cvta.to.global.u64 	%rd28, %rd10;
	mul.wide.s32 	%rd29, %r7, 4;
	add.s64 	%rd30, %rd28, %rd29;
	st.global.f32 	[%rd30], %f1;
	// A negative param_3 means there is no second output.
	setp.lt.s32	%p8, %r10, 0;
	@%p8 bra 	BB4_9;

	// %rd8 = address of param_5[idx]; take BB4_8 when n > 1, BB4_7 otherwise.
	cvta.to.global.u64 	%rd31, %rd11;
	setp.gt.s32	%p9, %r8, 1;
	add.s64 	%rd8, %rd31, %rd29;
	@%p9 bra 	BB4_8;
	bra.uni 	BB4_7;

BB4_8:
	// n > 1: param_5[idx] = max(0, (in[param_3] - n * m^2) / (n * (n - 1))).
	cvt.s64.s32	%rd33, %r10;
	add.s64 	%rd34, %rd7, %rd33;
	shl.b64 	%rd35, %rd34, 2;
	add.s64 	%rd36, %rd6, %rd35;
	cvt.rn.f32.s32	%f4, %r8;
	mul.ftz.f32 	%f5, %f1, %f1;
	mul.ftz.f32 	%f6, %f4, %f5;
	ld.f32 	%f7, [%rd36];
	sub.ftz.f32 	%f8, %f7, %f6;
	add.s32 	%r43, %r8, -1;
	mul.lo.s32 	%r44, %r43, %r8;
	cvt.rn.f32.s32	%f9, %r44;
	div.approx.ftz.f32 	%f10, %f8, %f9;
	mov.f32 	%f11, 0f00000000;
	max.ftz.f32 	%f12, %f11, %f10;
	st.global.f32 	[%rd8], %f12;
	bra.uni 	BB4_9;

BB4_7:
	// n <= 1: store the constant 1343554297 = 0x501502F9, the IEEE-754
	// single-precision encoding of 1.0e10.
	mov.u32 	%r42, 1343554297;
	st.global.u32 	[%rd8], %r42;

BB4_9:
	ret;
}

	// .globl	kernel_cuda_filter_write_feature
// kernel_cuda_filter_write_feature: each thread copies one float from the
// buffer at param_3 to the buffer at param_4.
// With a = param_1, b = param_2, c = param_6 (each four 32-bit ints .x/.y/.z/.w),
// thread coordinates (tx, ty), X = tx + b.x and Y = ty + b.y:
//   source float index = (Y - c.y) * ((c.z - c.x + 3) & -4) + (X - c.x)
//   dest float index   = (Y * a.y + X + a.x) * a.z + param_5
// Threads with tx >= b.z or ty >= b.w do nothing. param_0, a.w and c.w are
// not used.
.visible .entry kernel_cuda_filter_write_feature(
	.param .u32 kernel_cuda_filter_write_feature_param_0,
	.param .align 16 .b8 kernel_cuda_filter_write_feature_param_1[16],
	.param .align 16 .b8 kernel_cuda_filter_write_feature_param_2[16],
	.param .u64 kernel_cuda_filter_write_feature_param_3,
	.param .u64 kernel_cuda_filter_write_feature_param_4,
	.param .u32 kernel_cuda_filter_write_feature_param_5,
	.param .align 16 .b8 kernel_cuda_filter_write_feature_param_6[16]
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%x_oob, %y_oob, %oob;
	.reg .f32 	%val;
	.reg .b32 	%a_x, %a_y, %a_z, %a_w;
	.reg .b32 	%b_x, %b_y, %b_z, %b_w;
	.reg .b32 	%c_x, %c_y, %c_z, %c_w;
	.reg .b32 	%blk_x, %dim_x, %lane_x, %tx;
	.reg .b32 	%blk_y, %dim_y, %lane_y, %ty;
	.reg .b32 	%col, %row, %three, %w_tmp, %w_raw, %w_pad;
	.reg .b32 	%src_row, %src_col, %src_idx;
	.reg .b32 	%dst_col, %dst_pix, %dst_add, %dst_idx;
	.reg .b64 	%src_gen, %src_glob, %src_off, %src_ptr;
	.reg .b64 	%dst_gen, %dst_glob, %dst_off, %dst_ptr;

	// Thread coordinates: block index * block size + thread index.
	mov.u32 	%blk_x, %ctaid.x;
	mov.u32 	%dim_x, %ntid.x;
	mov.u32 	%lane_x, %tid.x;
	mad.lo.s32 	%tx, %blk_x, %dim_x, %lane_x;
	mov.u32 	%blk_y, %ctaid.y;
	mov.u32 	%dim_y, %ntid.y;
	mov.u32 	%lane_y, %tid.y;
	mad.lo.s32 	%ty, %blk_y, %dim_y, %lane_y;

	// b.z / b.w bound the thread coordinates.
	ld.param.v4.u32 	{%b_x, %b_y, %b_z, %b_w}, [kernel_cuda_filter_write_feature_param_2];
	setp.ge.s32	%x_oob, %tx, %b_z;
	setp.ge.s32	%y_oob, %ty, %b_w;
	or.pred  	%oob, %x_oob, %y_oob;
	@%oob bra 	WRITE_FEATURE_DONE;

	// X = tx + b.x, Y = ty + b.y are shared by both index computations.
	add.s32 	%col, %tx, %b_x;
	add.s32 	%row, %ty, %b_y;

	// Source index: (Y - c.y) * w + (X - c.x), where
	// w = (3 - c.x + c.z) & -4 rounds c.z - c.x up to a multiple of four.
	ld.param.v4.u32 	{%c_x, %c_y, %c_z, %c_w}, [kernel_cuda_filter_write_feature_param_6];
	mov.u32 	%three, 3;
	sub.s32 	%w_tmp, %three, %c_x;
	add.s32 	%w_raw, %w_tmp, %c_z;
	and.b32  	%w_pad, %w_raw, -4;
	sub.s32 	%src_row, %row, %c_y;
	sub.s32 	%src_col, %col, %c_x;
	mad.lo.s32 	%src_idx, %src_row, %w_pad, %src_col;

	// Fetch the value from param_3[source index].
	ld.param.u64 	%src_gen, [kernel_cuda_filter_write_feature_param_3];
	cvta.to.global.u64 	%src_glob, %src_gen;
	mul.wide.s32 	%src_off, %src_idx, 4;
	add.s64 	%src_ptr, %src_glob, %src_off;
	ld.global.f32 	%val, [%src_ptr];

	// Destination index: (Y * a.y + X + a.x) * a.z + param_5.
	ld.param.v4.u32 	{%a_x, %a_y, %a_z, %a_w}, [kernel_cuda_filter_write_feature_param_1];
	ld.param.u32 	%dst_add, [kernel_cuda_filter_write_feature_param_5];
	add.s32 	%dst_col, %col, %a_x;
	mad.lo.s32 	%dst_pix, %row, %a_y, %dst_col;
	mad.lo.s32 	%dst_idx, %dst_pix, %a_z, %dst_add;

	// Store the value to param_4[destination index].
	ld.param.u64 	%dst_gen, [kernel_cuda_filter_write_feature_param_4];
	cvta.to.global.u64 	%dst_glob, %dst_gen;
	mul.wide.s32 	%dst_off, %dst_idx, 4;
	add.s64 	%dst_ptr, %dst_glob, %dst_off;
	st.global.f32 	[%dst_ptr], %val;

WRITE_FEATURE_DONE:
	ret;
}

	// .globl	kernel_cuda_filter_detect_outliers
// kernel_cuda_filter_detect_outliers: per-pixel test over a window of up to
// 5x5 neighbours, followed by a log transform of the pixel's three channels.
//   param_0  input base address. Three channel values are read at idx,
//            idx + s, idx + 2s and three further values at idx + 3s,
//            idx + 4s, idx + 5s (s = param_5). The second triple is treated
//            as per-channel variance in the comments below (presumed from
//            how it is used -- confirm against the CUDA source).
//   param_1  output for the three (possibly rescaled) second-triple values,
//            each multiplied by 1 / (1 + channel)^2
//   param_2  buffer whose entry at idx is negated in place when the pixel is
//            flagged
//   param_3  output for ln(1 + channel) of the three (possibly rescaled)
//            channels
//   param_4  four 32-bit ints {x0, y0, x1, y1}; threads outside return
//   param_5  s, the element distance between consecutive channels
// idx = (y - y0) * w + (x - x0), w = (x1 - x0 + 3) & -4.
// The 100-byte local array holds up to 25 floats: the window's channel
// averages, kept sorted in ascending order by insertion.
.visible .entry kernel_cuda_filter_detect_outliers(
	.param .u64 kernel_cuda_filter_detect_outliers_param_0,
	.param .u64 kernel_cuda_filter_detect_outliers_param_1,
	.param .u64 kernel_cuda_filter_detect_outliers_param_2,
	.param .u64 kernel_cuda_filter_detect_outliers_param_3,
	.param .align 16 .b8 kernel_cuda_filter_detect_outliers_param_4[16],
	.param .u32 kernel_cuda_filter_detect_outliers_param_5
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.local .align 4 .b8 	__local_depot6[100];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<30>;
	.reg .f32 	%f<123>;
	.reg .b32 	%r<108>;
	.reg .b64 	%rd<69>;


	// %SPL = local-space address of the sorted array, %SP = its generic address.
	mov.u64 	%SPL, __local_depot6;
	cvta.local.u64 	%SP, %SPL;
	ld.param.u64 	%rd5, [kernel_cuda_filter_detect_outliers_param_0];
	ld.param.u64 	%rd6, [kernel_cuda_filter_detect_outliers_param_1];
	ld.param.u64 	%rd7, [kernel_cuda_filter_detect_outliers_param_2];
	ld.param.u64 	%rd8, [kernel_cuda_filter_detect_outliers_param_3];
	ld.param.v4.u32 	{%r35, %r36, %r37, %r38}, [kernel_cuda_filter_detect_outliers_param_4];
	ld.param.u32 	%r34, [kernel_cuda_filter_detect_outliers_param_5];
	// %r2 = x = ctaid.x * ntid.x + x0 + tid.x
	mov.u32 	%r39, %ctaid.x;
	mov.u32 	%r40, %ntid.x;
	mad.lo.s32 	%r41, %r39, %r40, %r35;
	mov.u32 	%r42, %tid.x;
	add.s32 	%r2, %r41, %r42;
	// %r4 = y = ctaid.y * ntid.y + y0 + tid.y
	mov.u32 	%r43, %ctaid.y;
	mov.u32 	%r44, %ntid.y;
	mad.lo.s32 	%r45, %r43, %r44, %r36;
	mov.u32 	%r46, %tid.y;
	add.s32 	%r4, %r45, %r46;
	// Exit when x >= x1 or y >= y1.
	setp.ge.s32	%p1, %r2, %r37;
	setp.ge.s32	%p2, %r4, %r38;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB6_30;

	// %r7 = w = (x1 - x0 + 3) & -4.
	mov.u32 	%r48, 3;
	sub.s32 	%r49, %r48, %r35;
	add.s32 	%r50, %r49, %r37;
	and.b32  	%r7, %r50, -4;
	// Window rows: y1 (%r96) runs from max(y - 2, y0) to min(y + 3, y1 bound) - 1.
	add.s32 	%r51, %r4, -2;
	max.s32 	%r96, %r51, %r36;
	add.s32 	%r52, %r4, 3;
	min.s32 	%r9, %r52, %r38;
	// %r106 = number of values stored in the sorted array,
	// %f109 = running maximum of neighbour variance averages.
	mov.u32 	%r106, 0;
	mov.f32 	%f42, 0f00000000;
	setp.ge.s32	%p4, %r96, %r9;
	mov.f32 	%f109, %f42;
	@%p4 bra 	BB6_22;

	// Window columns: x1 (%r98) runs from %r10 = max(x - 2, x0)
	// to %r11 = min(x + 3, x1 bound), exclusive.
	add.s32 	%r54, %r2, -2;
	max.s32 	%r10, %r54, %r35;
	add.s32 	%r55, %r2, 3;
	min.s32 	%r11, %r55, %r37;
	mov.u32 	%r106, 0;
	mov.f32 	%f109, 0f00000000;

BB6_3:
	// ---- Outer loop over window rows ----
	setp.ge.s32	%p5, %r10, %r11;
	@%p5 bra 	BB6_21;

	mov.u32 	%r98, %r10;

BB6_5:
	// ---- Inner loop over window columns ----
	// %r16 = neighbour index (y1 - y0) * w + (x1 - x0);
	// %rd12 = channel distance in bytes (s * 4, sign-extended).
	sub.s32 	%r58, %r96, %r36;
	sub.s32 	%r59, %r98, %r35;
	mad.lo.s32 	%r16, %r58, %r7, %r59;
	cvta.to.global.u64 	%rd9, %rd5;
	mul.wide.s32 	%rd10, %r16, 4;
	add.s64 	%rd11, %rd9, %rd10;
	shl.b32 	%r60, %r34, 2;
	cvt.s64.s32	%rd12, %r60;
	add.s64 	%rd13, %rd11, %rd12;
	add.s64 	%rd14, %rd13, %rd12;
	// %f5 = average of the three channels, each clamped to >= 0
	// (0f3EAAAAAB is 1/3).
	ld.global.f32 	%f46, [%rd11];
	mov.f32 	%f47, 0f00000000;
	max.ftz.f32 	%f48, %f46, %f47;
	ld.global.f32 	%f49, [%rd13];
	max.ftz.f32 	%f50, %f49, %f47;
	ld.global.f32 	%f51, [%rd14];
	max.ftz.f32 	%f52, %f51, %f47;
	add.ftz.f32 	%f53, %f48, %f50;
	add.ftz.f32 	%f54, %f53, %f52;
	mul.ftz.f32 	%f5, %f54, 0f3EAAAAAB;
	// Find the insert position %r101: the first slot whose value is greater
	// than %f5, or the current count when there is none.
	add.u64 	%rd15, %SP, 0;
	add.u64 	%rd68, %SPL, 0;
	mov.u32 	%r101, 0;
	setp.lt.s32	%p6, %r106, 1;
	@%p6 bra 	BB6_8;

BB6_6:
	ld.local.f32 	%f55, [%rd68];
	setp.gt.ftz.f32	%p7, %f55, %f5;
	@%p7 bra 	BB6_8;

	add.s64 	%rd68, %rd68, 4;
	add.s32 	%r101, %r101, 1;
	setp.lt.s32	%p8, %r101, %r106;
	@%p8 bra 	BB6_6;

BB6_8:
	// Shift slots [%r101, count) up by one to make room. The shift is
	// unrolled by four: (count - pos) & 3 single moves come first.
	setp.le.s32	%p9, %r106, %r101;
	@%p9 bra 	BB6_17;

	sub.s32 	%r61, %r106, %r101;
	and.b32  	%r62, %r61, 3;
	setp.eq.s32	%p10, %r62, 0;
	mov.u32 	%r105, %r106;
	@%p10 bra 	BB6_15;

	setp.eq.s32	%p11, %r62, 1;
	mov.u32 	%r103, %r106;
	@%p11 bra 	BB6_14;

	setp.eq.s32	%p12, %r62, 2;
	mov.u32 	%r102, %r106;
	@%p12 bra 	BB6_13;

	// Remainder 3: values[count] = values[count - 1], then fall through.
	add.s32 	%r102, %r106, -1;
	cvta.to.local.u64 	%rd17, %rd15;
	mul.wide.s32 	%rd18, %r106, 4;
	add.s64 	%rd19, %rd17, %rd18;
	ld.local.f32 	%f56, [%rd19+-4];
	st.local.f32 	[%rd19], %f56;

BB6_13:
	// values[%r102] = values[%r102 - 1]
	add.s32 	%r103, %r102, -1;
	cvta.to.local.u64 	%rd21, %rd15;
	mul.wide.s32 	%rd22, %r102, 4;
	add.s64 	%rd23, %rd21, %rd22;
	ld.local.f32 	%f57, [%rd23+-4];
	st.local.f32 	[%rd23], %f57;

BB6_14:
	// values[%r103] = values[%r103 - 1]
	add.s32 	%r105, %r103, -1;
	cvta.to.local.u64 	%rd25, %rd15;
	mul.wide.s32 	%rd26, %r103, 4;
	add.s64 	%rd27, %rd25, %rd26;
	ld.local.f32 	%f58, [%rd27+-4];
	st.local.f32 	[%rd27], %f58;

BB6_15:
	// Fewer than four slots to move in total: the single moves covered them.
	setp.lt.u32	%p13, %r61, 4;
	@%p13 bra 	BB6_17;

BB6_16:
	// Move four slots up by one per iteration (all four loads happen before
	// the stores), walking %r105 down until it reaches the insert position.
	cvta.to.local.u64 	%rd29, %rd15;
	mul.wide.s32 	%rd30, %r105, 4;
	add.s64 	%rd31, %rd29, %rd30;
	ld.local.f32 	%f59, [%rd31+-4];
	ld.local.f32 	%f60, [%rd31+-8];
	ld.local.f32 	%f61, [%rd31+-12];
	ld.local.f32 	%f62, [%rd31+-16];
	st.local.f32 	[%rd31], %f59;
	st.local.f32 	[%rd31+-4], %f60;
	st.local.f32 	[%rd31+-8], %f61;
	st.local.f32 	[%rd31+-12], %f62;
	add.s32 	%r105, %r105, -4;
	setp.gt.s32	%p14, %r105, %r101;
	@%p14 bra 	BB6_16;

BB6_17:
	// Insert %f5 at the found position and bump the count.
	setp.eq.s32	%p15, %r96, %r4;
	cvta.to.local.u64 	%rd33, %rd15;
	mul.wide.s32 	%rd34, %r101, 4;
	add.s64 	%rd35, %rd33, %rd34;
	st.local.f32 	[%rd35], %f5;
	add.s32 	%r106, %r106, 1;
	// Read the neighbour's second triple at element offsets 3s, 4s and 5s;
	// %f9 = its plain average (no clamping here).
	mad.lo.s32 	%r73, %r34, 3, %r16;
	mul.wide.s32 	%rd37, %r73, 4;
	add.s64 	%rd38, %rd9, %rd37;
	add.s64 	%rd40, %rd38, %rd12;
	add.s64 	%rd41, %rd40, %rd12;
	ld.global.f32 	%f6, [%rd40];
	ld.global.f32 	%f7, [%rd38];
	add.ftz.f32 	%f63, %f7, %f6;
	ld.global.f32 	%f8, [%rd41];
	add.ftz.f32 	%f64, %f63, %f8;
	mul.ftz.f32 	%f9, %f64, 0f3EAAAAAB;
	// Centre pixel (x1 == x and y1 == y) goes to BB6_19, neighbours to BB6_18.
	setp.eq.s32	%p16, %r98, %r2;
	and.pred  	%p17, %p16, %p15;
	@%p17 bra 	BB6_19;
	bra.uni 	BB6_18;

BB6_19:
	// Centre pixel: %f110 = -1.0 if any of the three values is negative,
	// otherwise their average.
	// NOTE(review): this is the only write to %f110, which is read after the
	// loops; it relies on the window always containing (x, y) -- confirm.
	setp.lt.ftz.f32	%p18, %f7, 0f00000000;
	setp.lt.ftz.f32	%p19, %f6, 0f00000000;
	or.pred  	%p20, %p18, %p19;
	setp.lt.ftz.f32	%p21, %f8, 0f00000000;
	or.pred  	%p22, %p20, %p21;
	selp.f32	%f110, 0fBF800000, %f9, %p22;
	bra.uni 	BB6_20;

BB6_18:
	// Neighbour: fold its average into the running maximum %f109.
	max.ftz.f32 	%f109, %f109, %f9;

BB6_20:
	add.s32 	%r98, %r98, 1;
	setp.lt.s32	%p23, %r98, %r11;
	@%p23 bra 	BB6_5;

BB6_21:
	add.s32 	%r96, %r96, 1;
	setp.lt.s32	%p24, %r96, %r9;
	@%p24 bra 	BB6_3;

BB6_22:
	// ---- Centre pixel processing ----
	// %r33 = idx; %rd43 = idx * 4 bytes; %rd45 = s * 4 bytes.
	sub.s32 	%r85, %r4, %r36;
	sub.s32 	%r91, %r2, %r35;
	mad.lo.s32 	%r33, %r85, %r7, %r91;
	cvta.to.global.u64 	%rd42, %rd5;
	mul.wide.s32 	%rd43, %r33, 4;
	add.s64 	%rd44, %rd42, %rd43;
	shl.b32 	%r92, %r34, 2;
	cvt.s64.s32	%rd45, %r92;
	add.s64 	%rd46, %rd44, %rd45;
	add.s64 	%rd47, %rd46, %rd45;
	mad.lo.s32 	%r93, %r34, 3, %r33;
	mul.wide.s32 	%rd48, %r93, 4;
	add.s64 	%rd49, %rd42, %rd48;
	add.s64 	%rd50, %rd49, %rd45;
	add.s64 	%rd51, %rd50, %rd45;
	// Channels (%f120..%f122) and second triple (%f21, %f118, %f119),
	// all clamped to >= 0.
	ld.global.f32 	%f65, [%rd44];
	max.ftz.f32 	%f120, %f65, %f42;
	ld.global.f32 	%f67, [%rd46];
	max.ftz.f32 	%f121, %f67, %f42;
	ld.global.f32 	%f68, [%rd47];
	max.ftz.f32 	%f122, %f68, %f42;
	ld.global.f32 	%f69, [%rd49];
	max.ftz.f32 	%f21, %f69, %f42;
	ld.global.f32 	%f70, [%rd50];
	max.ftz.f32 	%f118, %f70, %f42;
	ld.global.f32 	%f71, [%rd51];
	max.ftz.f32 	%f119, %f71, %f42;
	// %f24 = average of the centre pixel's clamped channels.
	add.ftz.f32 	%f72, %f120, %f121;
	add.ftz.f32 	%f73, %f72, %f122;
	mul.ftz.f32 	%f24, %f73, 0f3EAAAAAB;
	// %f76 = sorted value at position trunc(0.75 * count)
	// (0f3F400000 is 0.75).
	cvt.rn.f32.s32	%f74, %r106;
	mul.ftz.f32 	%f75, %f74, 0f3F400000;
	cvt.rzi.ftz.s32.f32	%r94, %f75;
	add.u64 	%rd53, %SPL, 0;
	mul.wide.s32 	%rd54, %r94, 4;
	add.s64 	%rd55, %rd53, %rd54;
	ld.local.f32 	%f76, [%rd55];
	// %f117 = neighbour maximum + 1e-4 + 1e-5;
	// %f26 = threshold = 2 * %f76 - 1e-5.
	add.ftz.f32 	%f77, %f109, 0f38D1B717;
	add.ftz.f32 	%f117, %f77, 0f3727C5AC;
	fma.rn.ftz.f32 	%f26, %f76, 0f40000000, 0fB727C5AC;
	// Not above the threshold (or unordered): keep the pixel as it is.
	setp.leu.ftz.f32	%p25, %f24, %f26;
	@%p25 bra 	BB6_23;

	// Above the threshold: go to BB6_28 when the centre value %f110 is
	// negative or greater than 9 * %f117 (0f41100000 is 9.0), else BB6_25.
	// %rd4 = address of param_2[idx].
	setp.lt.ftz.f32	%p26, %f110, 0f00000000;
	mul.ftz.f32 	%f78, %f117, 0f41100000;
	setp.gt.ftz.f32	%p27, %f110, %f78;
	or.pred  	%p28, %p26, %p27;
	cvta.to.global.u64 	%rd56, %rd7;
	add.s64 	%rd4, %rd56, %rd43;
	@%p28 bra 	BB6_28;
	bra.uni 	BB6_25;

BB6_28:
	// Flag the pixel: negate param_2[idx], scale the channels by
	// threshold / average, and use %f117 for all three second-triple values.
	ld.global.f32 	%f85, [%rd4];
	neg.ftz.f32 	%f86, %f85;
	st.global.f32 	[%rd4], %f86;
	div.approx.ftz.f32 	%f87, %f26, %f24;
	mul.ftz.f32 	%f120, %f120, %f87;
	mul.ftz.f32 	%f121, %f121, %f87;
	mul.ftz.f32 	%f122, %f122, %f87;
	mov.f32 	%f118, %f117;
	mov.f32 	%f119, %f117;
	bra.uni 	BB6_29;

BB6_23:
	// Unflagged: the first second-triple value is used unchanged.
	mov.f32 	%f117, %f21;
	bra.uni 	BB6_29;

BB6_25:
	// Second test: flag only if average - 3 * sqrt(%f110) is still above the
	// threshold (0fC0400000 is -3.0).
	sqrt.approx.ftz.f32 	%f79, %f110;
	fma.rn.ftz.f32 	%f80, %f79, 0fC0400000, %f24;
	setp.geu.ftz.f32	%p29, %f80, %f26;
	@%p29 bra 	BB6_26;

	// Negate param_2[idx], scale the channels by k = threshold / average and
	// the second-triple values by k^2.
	ld.global.f32 	%f81, [%rd4];
	neg.ftz.f32 	%f82, %f81;
	st.global.f32 	[%rd4], %f82;
	div.approx.ftz.f32 	%f83, %f26, %f24;
	mul.ftz.f32 	%f120, %f120, %f83;
	mul.ftz.f32 	%f121, %f121, %f83;
	mul.ftz.f32 	%f122, %f122, %f83;
	mul.ftz.f32 	%f84, %f83, %f83;
	mul.ftz.f32 	%f117, %f21, %f84;
	mul.ftz.f32 	%f118, %f118, %f84;
	mul.ftz.f32 	%f119, %f119, %f84;
	bra.uni 	BB6_29;

BB6_26:
	// Same as BB6_23: the first second-triple value is used unchanged.
	mov.f32 	%f117, %f21;

BB6_29:
	// Per channel c with second-triple value v:
	//   param_3 gets ln(1 + c) = lg2(1 + c) * ln 2  (0f3F317218 is ln 2)
	//   param_1 gets v / (1 + c)^2
	add.ftz.f32 	%f88, %f120, 0f3F800000;
	mov.f32 	%f89, 0f3F800000;
	div.approx.ftz.f32 	%f90, %f89, %f88;
	add.ftz.f32 	%f91, %f121, 0f3F800000;
	div.approx.ftz.f32 	%f92, %f89, %f91;
	add.ftz.f32 	%f93, %f122, 0f3F800000;
	div.approx.ftz.f32 	%f94, %f89, %f93;
	mul.ftz.f32 	%f95, %f90, %f90;
	mul.ftz.f32 	%f96, %f92, %f92;
	mul.ftz.f32 	%f97, %f94, %f94;
	mul.ftz.f32 	%f98, %f117, %f95;
	mul.ftz.f32 	%f99, %f118, %f96;
	mul.ftz.f32 	%f100, %f119, %f97;
	lg2.approx.ftz.f32 	%f101, %f88;
	mul.ftz.f32 	%f102, %f101, 0f3F317218;
	lg2.approx.ftz.f32 	%f103, %f91;
	mul.ftz.f32 	%f104, %f103, 0f3F317218;
	lg2.approx.ftz.f32 	%f105, %f93;
	mul.ftz.f32 	%f106, %f105, 0f3F317218;
	// Store the three log values to param_3 at idx, idx + s, idx + 2s.
	cvta.to.global.u64 	%rd58, %rd8;
	add.s64 	%rd60, %rd58, %rd43;
	st.global.f32 	[%rd60], %f102;
	add.s64 	%rd62, %rd60, %rd45;
	st.global.f32 	[%rd62], %f104;
	add.s64 	%rd63, %rd62, %rd45;
	st.global.f32 	[%rd63], %f106;
	// Store the three transformed second-triple values to param_1 at
	// idx, idx + s, idx + 2s.
	cvta.to.global.u64 	%rd64, %rd6;
	add.s64 	%rd65, %rd64, %rd43;
	st.global.f32 	[%rd65], %f98;
	add.s64 	%rd66, %rd65, %rd45;
	st.global.f32 	[%rd66], %f99;
	add.s64 	%rd67, %rd66, %rd45;
	st.global.f32 	[%rd67], %f100;

BB6_30:
	ret;
}

	// .globl	kernel_cuda_filter_combine_halves
//
// kernel_cuda_filter_combine_halves
//
// One thread handles one pixel (x, y), where
//   x = ctaid.x * ntid.x + tid.x + param_4[0]
//   y = ctaid.y * ntid.y + tid.y + param_4[1]
// Threads with x >= param_4[2] or y >= param_4[3] return immediately.
//
// Parameters (as used by the code below):
//   param_0  generic pointer, may be 0. If non-zero, receives
//            0.5 * (param_2[idx] + param_3[idx]).
//   param_1  generic pointer, may be 0. If zero, nothing more is written.
//            Otherwise receives a value derived from
//            0.25 * (param_2[i] - param_3[i])^2 (see below).
//   param_2  generic pointer to the first f32 input buffer.
//   param_3  generic pointer to the second f32 input buffer.
//   param_4  four 32-bit ints {x0, y0, x1, y1}: x0/y0 are the coordinate
//            origin, x1/y1 the exclusive upper bounds.
//   param_5  32-bit int r, window radius for the param_1 output.
//
// Buffer indexing: the row stride is (x1 - x0 + 3) & ~3, i.e. the width
// rounded up to a multiple of 4, and idx = (y - y0) * stride + (x - x0).
//
// param_1 output:
//   r == 0 : 0.25 * (param_2[idx] - param_3[idx])^2 for this pixel only.
//   r != 0 : the same quantity is gathered for every pixel of the window
//            [max(x-r,x0), min(x+r+1,x1)) x [max(y-r,y0), min(y+r+1,y1))
//            into a thread-local array, the array is insertion-sorted in
//            ascending order, and element (7 * count) / 8 is written.
//
// NOTE(review): the local array is 100 bytes (25 f32 slots) and no check of
// the sample count against that size is visible in this kernel; presumably
// callers keep r <= 2 (5 x 5 = 25 samples) - TODO confirm.
//
.visible .entry kernel_cuda_filter_combine_halves(
	.param .u64 kernel_cuda_filter_combine_halves_param_0,
	.param .u64 kernel_cuda_filter_combine_halves_param_1,
	.param .u64 kernel_cuda_filter_combine_halves_param_2,
	.param .u64 kernel_cuda_filter_combine_halves_param_3,
	.param .align 16 .b8 kernel_cuda_filter_combine_halves_param_4[16],
	.param .u32 kernel_cuda_filter_combine_halves_param_5
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	// Thread-local scratch for the gathered samples: 100 bytes = 25 f32 slots.
	.local .align 4 .b8 	__local_depot7[100];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<20>;
	.reg .f32 	%f<48>;
	.reg .b32 	%r<153>;
	.reg .b64 	%rd<60>;


	// Load parameters: %rd14..%rd17 = param_0..param_3,
	// {%r42,%r43,%r44,%r45} = param_4 {x0,y0,x1,y1}, %r41 = param_5 (r).
	mov.u64 	%SPL, __local_depot7;
	ld.param.u64 	%rd14, [kernel_cuda_filter_combine_halves_param_0];
	ld.param.u64 	%rd15, [kernel_cuda_filter_combine_halves_param_1];
	ld.param.u64 	%rd16, [kernel_cuda_filter_combine_halves_param_2];
	ld.param.u64 	%rd17, [kernel_cuda_filter_combine_halves_param_3];
	ld.param.v4.u32 	{%r42, %r43, %r44, %r45}, [kernel_cuda_filter_combine_halves_param_4];
	ld.param.u32 	%r41, [kernel_cuda_filter_combine_halves_param_5];
	// %rd1 = base address of the local sample array.
	add.u64 	%rd1, %SPL, 0;
	// %r50 = x = ctaid.x * ntid.x + x0 + tid.x
	mov.u32 	%r46, %ctaid.x;
	mov.u32 	%r47, %ntid.x;
	mad.lo.s32 	%r48, %r46, %r47, %r42;
	mov.u32 	%r49, %tid.x;
	add.s32 	%r50, %r48, %r49;
	// %r55 = y = ctaid.y * ntid.y + y0 + tid.y
	mov.u32 	%r51, %ctaid.y;
	mov.u32 	%r52, %ntid.y;
	mad.lo.s32 	%r53, %r51, %r52, %r43;
	mov.u32 	%r54, %tid.y;
	add.s32 	%r55, %r53, %r54;
	// Bounds check: leave if x >= x1 or y >= y1.
	setp.ge.s32	%p1, %r50, %r44;
	setp.ge.s32	%p2, %r55, %r45;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB7_28;

	// %r5 = row stride = (x1 - x0 + 3) & -4 (width rounded up to a multiple of 4).
	mov.u32 	%r56, 3;
	sub.s32 	%r57, %r56, %r42;
	add.s32 	%r58, %r57, %r44;
	and.b32  	%r5, %r58, -4;
	// %r71 = idx = (y - y0) * stride + (x - x0); %rd20 = idx * 4 (byte offset).
	sub.s32 	%r64, %r55, %r43;
	sub.s32 	%r70, %r50, %r42;
	mad.lo.s32 	%r71, %r64, %r5, %r70;
	// %rd19 / %rd21 = global-space bases of param_2 / param_3;
	// %rd2 / %rd3 = addresses of this pixel in each of them.
	cvta.to.global.u64 	%rd19, %rd16;
	mul.wide.s32 	%rd20, %r71, 4;
	add.s64 	%rd2, %rd19, %rd20;
	cvta.to.global.u64 	%rd21, %rd17;
	add.s64 	%rd3, %rd21, %rd20;
	// param_0 is optional: skip the average when the pointer is 0.
	setp.eq.s64	%p4, %rd14, 0;
	@%p4 bra 	BB7_3;

	// param_0[idx] = (param_2[idx] + param_3[idx]) * 0.5   (0f3F000000 = 0.5)
	ld.global.f32 	%f3, [%rd3];
	ld.global.f32 	%f4, [%rd2];
	add.ftz.f32 	%f5, %f4, %f3;
	mul.ftz.f32 	%f6, %f5, 0f3F000000;
	cvta.to.global.u64 	%rd22, %rd14;
	add.s64 	%rd24, %rd22, %rd20;
	st.global.f32 	[%rd24], %f6;

BB7_3:
	// param_1 is optional: nothing left to do when the pointer is 0.
	setp.eq.s64	%p5, %rd15, 0;
	@%p5 bra 	BB7_28;

	// %rd4 = address of param_1[idx]. r == 0 takes the single-pixel path (BB7_27).
	setp.eq.s32	%p6, %r41, 0;
	cvta.to.global.u64 	%rd25, %rd15;
	add.s64 	%rd4, %rd25, %rd20;
	@%p6 bra 	BB7_27;

	// Windowed path. param_1[idx] is first cleared to 0; %r147 = sample count = 0.
	mov.u32 	%r147, 0;
	st.global.u32 	[%rd4], %r147;
	// Row range: %r136 = max(y - r, y0) (current row), %r8 = min(y + r + 1, y1).
	sub.s32 	%r104, %r55, %r41;
	max.s32 	%r136, %r104, %r43;
	add.s32 	%r7, %r41, 1;
	add.s32 	%r105, %r7, %r55;
	min.s32 	%r8, %r105, %r45;
	// Empty row range: go straight to the sort/select stage with count 0.
	setp.ge.s32	%p7, %r136, %r8;
	@%p7 bra 	BB7_21;

	// Column range: %r9 = max(x - r, x0), %r10 = min(x + r + 1, x1).
	sub.s32 	%r112, %r50, %r41;
	max.s32 	%r9, %r112, %r42;
	add.s32 	%r113, %r7, %r50;
	min.s32 	%r10, %r113, %r44;
	mov.u32 	%r147, 0;

BB7_7:
	// ---- Row loop head (row = %r136). Skip the row if the column range is empty.
	setp.le.s32	%p8, %r10, %r9;
	@%p8 bra 	BB7_20;

	// %r115 = number of columns in the row, %r116 = %r115 & 3 (remainder that is
	// peeled off before the 4x unrolled loop).
	// %r13 = (row - y0) * stride - x0, so that the buffer index is %r13 + px.
	sub.s32 	%r115, %r10, %r9;
	and.b32  	%r116, %r115, 3;
	setp.eq.s32	%p9, %r116, 0;
	mov.u32 	%r114, 0;
	sub.s32 	%r117, %r136, %r43;
	mul.lo.s32 	%r118, %r117, %r5;
	sub.s32 	%r13, %r118, %r42;
	@%p9 bra 	BB7_9;
	bra.uni 	BB7_10;

BB7_9:
	// Remainder 0: nothing to peel. %r145 = first column, %r143 = running count.
	// %r147 is overwritten with %r114 (0) here and reloaded from %r143 before the
	// unrolled loop; the early exit in BB7_17 is not taken on this path because
	// %r115 is a non-zero multiple of 4.
	mov.u32 	%r145, %r9;
	mov.u32 	%r143, %r147;
	mov.u32 	%r147, %r114;
	bra.uni 	BB7_17;

BB7_10:
	// Remainder 1: peel one element (BB7_16 only).
	setp.eq.s32	%p10, %r116, 1;
	@%p10 bra 	BB7_11;
	bra.uni 	BB7_12;

BB7_11:
	mov.u32 	%r140, %r9;
	bra.uni 	BB7_16;

BB7_12:
	// Remainder 2: peel two elements (BB7_15, BB7_16); otherwise remainder 3
	// falls into BB7_14 and peels three.
	setp.eq.s32	%p11, %r116, 2;
	@%p11 bra 	BB7_13;
	bra.uni 	BB7_14;

BB7_13:
	mov.u32 	%r138, %r9;
	bra.uni 	BB7_15;

BB7_14:
	// Peeled element (column %r9):
	// values[count++] = 0.25 * d * d with d = param_2[i] - param_3[i]
	// (0f3E800000 = 0.25).
	add.s32 	%r123, %r13, %r9;
	mul.wide.s32 	%rd28, %r123, 4;
	add.s64 	%rd29, %rd19, %rd28;
	add.s64 	%rd31, %rd21, %rd28;
	ld.global.f32 	%f7, [%rd31];
	ld.global.f32 	%f8, [%rd29];
	sub.ftz.f32 	%f9, %f8, %f7;
	mul.ftz.f32 	%f10, %f9, 0f3E800000;
	mul.ftz.f32 	%f11, %f9, %f10;
	mul.wide.s32 	%rd32, %r147, 4;
	add.s64 	%rd33, %rd1, %rd32;
	st.local.f32 	[%rd33], %f11;
	add.s32 	%r147, %r147, 1;
	add.s32 	%r138, %r9, 1;

BB7_15:
	// Peeled element (column %r138), same computation as above.
	add.s32 	%r124, %r13, %r138;
	mul.wide.s32 	%rd35, %r124, 4;
	add.s64 	%rd36, %rd19, %rd35;
	add.s64 	%rd38, %rd21, %rd35;
	ld.global.f32 	%f12, [%rd38];
	ld.global.f32 	%f13, [%rd36];
	sub.ftz.f32 	%f14, %f13, %f12;
	mul.ftz.f32 	%f15, %f14, 0f3E800000;
	mul.ftz.f32 	%f16, %f14, %f15;
	mul.wide.s32 	%rd39, %r147, 4;
	add.s64 	%rd40, %rd1, %rd39;
	st.local.f32 	[%rd40], %f16;
	add.s32 	%r147, %r147, 1;
	add.s32 	%r140, %r138, 1;

BB7_16:
	// Last peeled element (column %r140). Afterwards %r143 = %r147 = updated
	// count and %r145 = next column to process.
	add.s32 	%r125, %r13, %r140;
	mul.wide.s32 	%rd42, %r125, 4;
	add.s64 	%rd43, %rd19, %rd42;
	add.s64 	%rd45, %rd21, %rd42;
	ld.global.f32 	%f17, [%rd45];
	ld.global.f32 	%f18, [%rd43];
	sub.ftz.f32 	%f19, %f18, %f17;
	mul.ftz.f32 	%f20, %f19, 0f3E800000;
	mul.ftz.f32 	%f21, %f19, %f20;
	mul.wide.s32 	%rd46, %r147, 4;
	add.s64 	%rd47, %rd1, %rd46;
	st.local.f32 	[%rd47], %f21;
	add.s32 	%r143, %r147, 1;
	add.s32 	%r145, %r140, 1;
	mov.u32 	%r147, %r143;

BB7_17:
	// Fewer than 4 columns in the row: the peeled elements covered all of it.
	setp.lt.u32	%p12, %r115, 4;
	@%p12 bra 	BB7_20;

	// Set up the unrolled loop:
	// %rd59 = byte offset of ((row - y0) * stride + (%r145 - x0)) into the inputs,
	// %rd58 = address of values[%r143], %r147 = %r143 (running count).
	sub.s32 	%r127, %r145, %r42;
	mad.lo.s32 	%r129, %r5, %r117, %r127;
	mul.wide.s32 	%rd59, %r129, 4;
	mul.wide.s32 	%rd48, %r143, 4;
	add.s64 	%rd58, %rd1, %rd48;
	mov.u32 	%r147, %r143;

BB7_19:
	// ---- 4x unrolled column loop: four consecutive samples per iteration.
	// %rd49 / %rd50 = current addresses in param_2 / param_3.
	add.s64 	%rd49, %rd19, %rd59;
	add.s64 	%rd50, %rd21, %rd59;
	ld.global.f32 	%f22, [%rd50];
	ld.global.f32 	%f23, [%rd49];
	sub.ftz.f32 	%f24, %f23, %f22;
	mul.ftz.f32 	%f25, %f24, 0f3E800000;
	mul.ftz.f32 	%f26, %f24, %f25;
	ld.global.f32 	%f27, [%rd50+4];
	ld.global.f32 	%f28, [%rd49+4];
	ld.global.f32 	%f29, [%rd50+8];
	ld.global.f32 	%f30, [%rd49+8];
	ld.global.f32 	%f31, [%rd50+12];
	ld.global.f32 	%f32, [%rd49+12];
	st.local.f32 	[%rd58], %f26;
	sub.ftz.f32 	%f33, %f28, %f27;
	mul.ftz.f32 	%f34, %f33, 0f3E800000;
	mul.ftz.f32 	%f35, %f33, %f34;
	st.local.f32 	[%rd58+4], %f35;
	sub.ftz.f32 	%f36, %f30, %f29;
	mul.ftz.f32 	%f37, %f36, 0f3E800000;
	mul.ftz.f32 	%f38, %f36, %f37;
	st.local.f32 	[%rd58+8], %f38;
	sub.ftz.f32 	%f39, %f32, %f31;
	mul.ftz.f32 	%f40, %f39, 0f3E800000;
	mul.ftz.f32 	%f41, %f39, %f40;
	st.local.f32 	[%rd58+12], %f41;
	// Advance count, input offset, local pointer and column by 4 elements;
	// repeat while column < %r10.
	add.s32 	%r147, %r147, 4;
	add.s64 	%rd59, %rd59, 16;
	add.s64 	%rd58, %rd58, 16;
	add.s32 	%r145, %r145, 4;
	setp.lt.s32	%p13, %r145, %r10;
	@%p13 bra 	BB7_19;

BB7_20:
	// ---- Row loop tail: next row while row < %r8.
	add.s32 	%r136, %r136, 1;
	setp.lt.s32	%p14, %r136, %r8;
	@%p14 bra 	BB7_7;

BB7_21:
	// ---- Insertion sort of values[0 .. count) in ascending order.
	// %r149 = i (starts at 1); skipped entirely when count < 2.
	mov.u32 	%r149, 1;
	setp.lt.s32	%p15, %r147, 2;
	@%p15 bra 	BB7_26;

BB7_22:
	// %f1 = v = values[i]; %r150 = j = i - 1; %r152 = slot where v will be stored.
	mul.wide.s32 	%rd51, %r149, 4;
	add.s64 	%rd52, %rd1, %rd51;
	ld.local.f32 	%f1, [%rd52];
	add.s32 	%r150, %r149, -1;
	setp.lt.s32	%p16, %r150, 0;
	mov.u32 	%r152, %r149;
	@%p16 bra 	BB7_25;

BB7_23:
	// Inner loop: stop shifting once values[j] <= v (leu is also true when
	// either operand is NaN).
	mov.u32 	%r36, %r150;
	mul.wide.s32 	%rd53, %r36, 4;
	add.s64 	%rd13, %rd1, %rd53;
	ld.local.f32 	%f2, [%rd13];
	setp.leu.ftz.f32	%p17, %f2, %f1;
	@%p17 bra 	BB7_25;

	// values[j + 1] = values[j]; j--; continue while j >= 0.
	st.local.f32 	[%rd13+4], %f2;
	add.s32 	%r150, %r36, -1;
	setp.gt.s32	%p18, %r150, -1;
	mov.u32 	%r152, %r36;
	@%p18 bra 	BB7_23;

BB7_25:
	// values[slot] = v; i++; repeat while i < count.
	mul.wide.s32 	%rd54, %r152, 4;
	add.s64 	%rd55, %rd1, %rd54;
	st.local.f32 	[%rd55], %f1;
	add.s32 	%r149, %r149, 1;
	setp.lt.s32	%p19, %r149, %r147;
	@%p19 bra 	BB7_22;

BB7_26:
	// Select element (7 * count) / 8. The shr/shr/add/shr sequence is a signed
	// divide by 8 that rounds toward zero (adds 7 first when the product is
	// negative).
	// NOTE(review): if count is 0 this loads values[0], which this kernel has not
	// written; presumably unreachable for r > 0 - TODO confirm against callers.
	mul.lo.s32 	%r131, %r147, 7;
	shr.s32 	%r132, %r131, 31;
	shr.u32 	%r133, %r132, 29;
	add.s32 	%r134, %r131, %r133;
	shr.s32 	%r135, %r134, 3;
	mul.wide.s32 	%rd56, %r135, 4;
	add.s64 	%rd57, %rd1, %rd56;
	ld.local.f32 	%f42, [%rd57];
	st.global.f32 	[%rd4], %f42;
	bra.uni 	BB7_28;

BB7_27:
	// r == 0: param_1[idx] = 0.25 * d * d with d = param_2[idx] - param_3[idx].
	ld.global.f32 	%f43, [%rd2];
	ld.global.f32 	%f44, [%rd3];
	sub.ftz.f32 	%f45, %f43, %f44;
	mul.ftz.f32 	%f46, %f45, 0f3E800000;
	mul.ftz.f32 	%f47, %f45, %f46;
	st.global.f32 	[%rd4], %f47;

BB7_28:
	ret;
}

	// .globl	kernel_cuda_filter_construct_transform
.visible .entry kernel_cuda_filter_construct_transform(
	.param .u64 kernel_cuda_filter_construct_transform_param_0,
	.param .u64 kernel_cuda_filter_construct_transform_param_1,
	.param .u64 kernel_cuda_filter_construct_transform_param_2,
	.param .u64 kernel_cuda_filter_construct_transform_param_3,
	.param .align 16 .b8 kernel_cuda_filter_construct_transform_param_4[16],
	.param .align 16 .b8 kernel_cuda_filter_construct_transform_param_5[16],
	.param .u32 kernel_cuda_filter_construct_transform_param_6,
	.param .f32 kernel_cuda_filter_construct_transform_param_7,
	.param .u32 kernel_cuda_filter_construct_transform_param_8,
	.param .u32 kernel_cuda_filter_construct_transform_param_9,
	.param .u8 kernel_cuda_filter_construct_transform_param_10
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.local .align 4 .b8 	__local_depot8[572];
	.reg .b64 	%SP;
	.reg .b64 	%SPL;
	.reg .pred 	%p<219>;
	.reg .b16 	%rs<7>;
	.reg .f32 	%f<767>;
	.reg .b32 	%r<899>;
	.reg .b64 	%rd<455>;
	// demoted variable
	.shared .align 4 .b8 _ZZ33kernel_filter_construct_transformPKfP8TileInfoii4int4iibPfPiifiiE15shared_features[11264];

	mov.u64 	%SPL, __local_depot8;
	ld.param.u64 	%rd182, [kernel_cuda_filter_construct_transform_param_0];
	ld.param.u64 	%rd183, [kernel_cuda_filter_construct_transform_param_1];
	ld.param.u64 	%rd185, [kernel_cuda_filter_construct_transform_param_2];
	ld.param.v4.u32 	{%r446, %r447, %r448, %r449}, [kernel_cuda_filter_construct_transform_param_4];
	ld.param.v4.u32 	{%r450, %r451, %r452, %r453}, [kernel_cuda_filter_construct_transform_param_5];
	ld.param.u32 	%r443, [kernel_cuda_filter_construct_transform_param_6];
	ld.param.f32 	%f135, [kernel_cuda_filter_construct_transform_param_7];
	ld.param.u32 	%r444, [kernel_cuda_filter_construct_transform_param_8];
	ld.param.u32 	%r445, [kernel_cuda_filter_construct_transform_param_9];
	ld.param.s8 	%rs1, [kernel_cuda_filter_construct_transform_param_10];
	cvta.to.global.u64 	%rd1, %rd185;
	cvta.to.global.u64 	%rd2, %rd183;
	add.u64 	%rd3, %SPL, 0;
	add.u64 	%rd4, %SPL, 44;
	add.u64 	%rd5, %SPL, 88;
	mov.u32 	%r454, %ctaid.x;
	mov.u32 	%r1, %ntid.x;
	mov.u32 	%r2, %tid.x;
	mad.lo.s32 	%r3, %r454, %r1, %r2;
	mov.u32 	%r455, %ctaid.y;
	mov.u32 	%r456, %ntid.y;
	mov.u32 	%r4, %tid.y;
	mad.lo.s32 	%r5, %r455, %r456, %r4;
	setp.ge.s32	%p1, %r3, %r448;
	setp.ge.s32	%p2, %r5, %r449;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	BB8_303;

	cvta.to.global.u64 	%rd6, %rd182;
	mad.lo.s32 	%r8, %r5, %r448, %r3;
	add.s32 	%r458, %r3, %r446;
	add.s32 	%r459, %r5, %r447;
	mul.lo.s32 	%r9, %r448, %r449;
	mad.lo.s32 	%r460, %r4, %r1, %r2;
	mov.u32 	%r461, 3;
	sub.s32 	%r462, %r461, %r450;
	add.s32 	%r463, %r462, %r452;
	and.b32  	%r11, %r463, -4;
	mul.lo.s32 	%r12, %r460, 11;
	shl.b32 	%r464, %r12, 2;
	mov.u32 	%r465, _ZZ33kernel_filter_construct_transformPKfP8TileInfoii4int4iibPfPiifiiE15shared_features;
	add.s32 	%r13, %r465, %r464;
	and.b16  	%rs2, %rs1, 255;
	setp.eq.s16	%p4, %rs2, 0;
	selp.b32	%r14, 10, 11, %p4;
	sub.s32 	%r466, %r458, %r443;
	max.s32 	%r15, %r450, %r466;
	sub.s32 	%r467, %r459, %r443;
	max.s32 	%r17, %r451, %r467;
	add.s32 	%r468, %r443, 1;
	add.s32 	%r469, %r468, %r458;
	min.s32 	%r18, %r452, %r469;
	add.s32 	%r470, %r468, %r459;
	min.s32 	%r19, %r453, %r470;
	sub.s32 	%r471, %r19, %r17;
	sub.s32 	%r472, %r18, %r15;
	mul.lo.s32 	%r473, %r471, %r472;
	ld.global.u32 	%r20, [%rd2+172];
	mul.lo.s32 	%r21, %r473, %r20;
	mov.u32 	%r457, 0;
	mov.u64 	%rd410, %rd3;
	mov.u32 	%r749, %r457;

BB8_2:
	st.local.u32 	[%rd410], %r457;
	add.s64 	%rd410, %rd410, 4;
	add.s32 	%r749, %r749, 1;
	setp.lt.s32	%p5, %r749, %r14;
	@%p5 bra 	BB8_2;

	setp.lt.s32	%p6, %r20, 1;
	@%p6 bra 	BB8_15;

	sub.s32 	%r476, %r17, %r451;
	sub.s32 	%r477, %r15, %r450;
	mad.lo.s32 	%r24, %r476, %r11, %r477;
	add.s32 	%r478, %r15, %r11;
	sub.s32 	%r479, %r478, %r18;
	cvt.s64.s32	%rd9, %r479;
	cvt.s64.s32	%rd10, %r444;
	mul.wide.s32 	%rd11, %r444, 4;
	mov.u32 	%r750, 0;
	shl.b64 	%rd192, %rd10, 2;
	shl.b64 	%rd202, %rd9, 2;

BB8_5:
	mad.lo.s32 	%r480, %r750, %r445, %r24;
	mul.wide.s32 	%rd189, %r480, 4;
	add.s64 	%rd413, %rd6, %rd189;
	setp.le.s32	%p7, %r19, %r17;
	@%p7 bra 	BB8_14;

	mul.wide.s32 	%rd190, %r750, 4;
	add.s64 	%rd191, %rd2, %rd190;
	ld.global.u32 	%r481, [%rd191+108];
	cvt.rn.f32.s32	%f1, %r481;
	mov.u32 	%r751, %r17;

BB8_7:
	setp.le.s32	%p8, %r18, %r15;
	@%p8 bra 	BB8_13;

	mov.u32 	%r752, %r15;

BB8_9:
	cvt.rn.f32.s32	%f729, %r751;
	cvt.rn.f32.s32	%f136, %r752;
	st.shared.f32 	[%r13], %f136;
	st.shared.f32 	[%r13+4], %f729;
	ld.global.nc.f32 	%f137, [%rd413];
	abs.ftz.f32 	%f138, %f137;
	st.shared.f32 	[%r13+8], %f138;
	add.s64 	%rd193, %rd413, %rd192;
	ld.global.nc.f32 	%f139, [%rd193];
	st.shared.f32 	[%r13+12], %f139;
	add.s64 	%rd194, %rd193, %rd11;
	ld.global.nc.f32 	%f140, [%rd194];
	st.shared.f32 	[%r13+16], %f140;
	add.s64 	%rd195, %rd194, %rd11;
	ld.global.nc.f32 	%f141, [%rd195];
	st.shared.f32 	[%r13+20], %f141;
	add.s64 	%rd196, %rd195, %rd11;
	ld.global.nc.f32 	%f142, [%rd196];
	st.shared.f32 	[%r13+24], %f142;
	add.s64 	%rd197, %rd196, %rd11;
	ld.global.nc.f32 	%f143, [%rd197];
	st.shared.f32 	[%r13+28], %f143;
	add.s64 	%rd198, %rd197, %rd11;
	ld.global.nc.f32 	%f144, [%rd198];
	st.shared.f32 	[%r13+32], %f144;
	add.s64 	%rd199, %rd198, %rd11;
	ld.global.nc.f32 	%f145, [%rd199];
	st.shared.f32 	[%r13+36], %f145;
	mov.u32 	%r753, 0;
	@%p4 bra 	BB8_11;

	st.shared.f32 	[%r13+40], %f1;

BB8_11:
	add.s32 	%r484, %r753, %r12;
	shl.b32 	%r485, %r484, 2;
	add.s32 	%r487, %r465, %r485;
	mul.wide.s32 	%rd200, %r753, 4;
	add.s64 	%rd201, %rd3, %rd200;
	ld.local.f32 	%f146, [%rd201];
	ld.shared.f32 	%f147, [%r487];
	add.ftz.f32 	%f148, %f147, %f146;
	st.local.f32 	[%rd201], %f148;
	add.s32 	%r753, %r753, 1;
	setp.lt.s32	%p10, %r753, %r14;
	@%p10 bra 	BB8_11;

	add.s32 	%r752, %r752, 1;
	add.s64 	%rd413, %rd413, 4;
	setp.lt.s32	%p11, %r752, %r18;
	@%p11 bra 	BB8_9;

BB8_13:
	add.s64 	%rd413, %rd413, %rd202;
	add.s32 	%r751, %r751, 1;
	setp.lt.s32	%p12, %r751, %r19;
	@%p12 bra 	BB8_7;

BB8_14:
	add.s32 	%r750, %r750, 1;
	setp.lt.s32	%p13, %r750, %r20;
	@%p13 bra 	BB8_5;

BB8_15:
	cvt.rn.f32.s32	%f149, %r21;
	rcp.approx.ftz.f32 	%f3, %f149;
	mov.u32 	%r488, 0;
	mov.u64 	%rd414, %rd3;
	mov.u32 	%r754, %r488;

BB8_16:
	ld.local.f32 	%f150, [%rd414];
	mul.ftz.f32 	%f151, %f3, %f150;
	st.local.f32 	[%rd414], %f151;
	add.s64 	%rd414, %rd414, 4;
	add.s32 	%r754, %r754, 1;
	setp.lt.s32	%p14, %r754, %r14;
	@%p14 bra 	BB8_16;

	mov.u64 	%rd415, %rd4;
	mov.u32 	%r755, %r488;

BB8_18:
	st.local.u32 	[%rd415], %r488;
	add.s64 	%rd415, %rd415, 4;
	add.s32 	%r755, %r755, 1;
	setp.lt.s32	%p15, %r755, %r14;
	@%p15 bra 	BB8_18;

	setp.lt.s32	%p218, %r20, 1;
	@%p218 bra 	BB8_31;

	mov.u32 	%r756, 0;

BB8_21:
	sub.s32 	%r492, %r17, %r451;
	sub.s32 	%r493, %r15, %r450;
	mad.lo.s32 	%r494, %r492, %r11, %r493;
	mad.lo.s32 	%r495, %r756, %r445, %r494;
	mul.wide.s32 	%rd204, %r495, 4;
	add.s64 	%rd418, %rd6, %rd204;
	setp.le.s32	%p17, %r19, %r17;
	@%p17 bra 	BB8_30;

	mul.wide.s32 	%rd205, %r756, 4;
	add.s64 	%rd206, %rd2, %rd205;
	ld.global.u32 	%r496, [%rd206+108];
	cvt.rn.f32.s32	%f4, %r496;
	mov.u32 	%r757, %r17;

BB8_23:
	setp.le.s32	%p18, %r18, %r15;
	@%p18 bra 	BB8_29;

	ld.local.f32 	%f5, [%rd3];
	ld.local.f32 	%f152, [%rd3+4];
	cvt.rn.f32.s32	%f153, %r757;
	sub.ftz.f32 	%f154, %f153, %f152;
	abs.ftz.f32 	%f6, %f154;
	ld.local.f32 	%f7, [%rd3+8];
	ld.local.f32 	%f8, [%rd3+12];
	ld.local.f32 	%f9, [%rd3+16];
	ld.local.f32 	%f10, [%rd3+20];
	ld.local.f32 	%f11, [%rd3+24];
	ld.local.f32 	%f12, [%rd3+28];
	ld.local.f32 	%f13, [%rd3+32];
	ld.local.f32 	%f14, [%rd3+36];
	mov.u32 	%r758, %r15;

BB8_25:
	cvt.rn.f32.s32	%f155, %r758;
	sub.ftz.f32 	%f156, %f155, %f5;
	abs.ftz.f32 	%f157, %f156;
	st.shared.f32 	[%r13], %f157;
	st.shared.f32 	[%r13+4], %f6;
	ld.global.nc.f32 	%f158, [%rd418];
	abs.ftz.f32 	%f159, %f158;
	sub.ftz.f32 	%f160, %f159, %f7;
	abs.ftz.f32 	%f161, %f160;
	st.shared.f32 	[%r13+8], %f161;
	mul.wide.s32 	%rd207, %r444, 4;
	add.s64 	%rd208, %rd418, %rd207;
	ld.global.nc.f32 	%f162, [%rd208];
	sub.ftz.f32 	%f163, %f162, %f8;
	add.s64 	%rd209, %rd208, %rd207;
	ld.global.nc.f32 	%f164, [%rd209];
	sub.ftz.f32 	%f165, %f164, %f9;
	add.s64 	%rd210, %rd209, %rd207;
	ld.global.nc.f32 	%f166, [%rd210];
	sub.ftz.f32 	%f167, %f166, %f10;
	mul.ftz.f32 	%f168, %f165, %f165;
	fma.rn.ftz.f32 	%f169, %f163, %f163, %f168;
	fma.rn.ftz.f32 	%f170, %f167, %f167, %f169;
	st.shared.f32 	[%r13+12], %f170;
	add.s64 	%rd211, %rd210, %rd207;
	ld.global.nc.f32 	%f171, [%rd211];
	sub.ftz.f32 	%f172, %f171, %f11;
	abs.ftz.f32 	%f173, %f172;
	st.shared.f32 	[%r13+16], %f173;
	add.s64 	%rd212, %rd211, %rd207;
	ld.global.nc.f32 	%f174, [%rd212];
	sub.ftz.f32 	%f175, %f174, %f12;
	add.s64 	%rd213, %rd212, %rd207;
	ld.global.nc.f32 	%f176, [%rd213];
	sub.ftz.f32 	%f177, %f176, %f13;
	add.s64 	%rd214, %rd213, %rd207;
	ld.global.nc.f32 	%f178, [%rd214];
	sub.ftz.f32 	%f179, %f178, %f14;
	mul.ftz.f32 	%f180, %f177, %f177;
	fma.rn.ftz.f32 	%f181, %f175, %f175, %f180;
	fma.rn.ftz.f32 	%f182, %f179, %f179, %f181;
	st.shared.f32 	[%r13+20], %f182;
	mov.u32 	%r759, 0;
	@%p4 bra 	BB8_27;

	ld.local.f32 	%f183, [%rd3+40];
	sub.ftz.f32 	%f184, %f4, %f183;
	abs.ftz.f32 	%f185, %f184;
	st.shared.f32 	[%r13+24], %f185;

BB8_27:
	mul.wide.s32 	%rd215, %r759, 4;
	add.s64 	%rd216, %rd4, %rd215;
	add.s32 	%r515, %r759, %r12;
	shl.b32 	%r516, %r515, 2;
	add.s32 	%r518, %r465, %r516;
	ld.shared.f32 	%f186, [%r518];
	ld.local.f32 	%f187, [%rd216];
	max.ftz.f32 	%f188, %f187, %f186;
	st.local.f32 	[%rd216], %f188;
	add.s32 	%r759, %r759, 1;
	setp.lt.s32	%p20, %r759, %r14;
	@%p20 bra 	BB8_27;

	add.s32 	%r758, %r758, 1;
	add.s64 	%rd418, %rd418, 4;
	setp.lt.s32	%p21, %r758, %r18;
	@%p21 bra 	BB8_25;

BB8_29:
	add.s32 	%r519, %r15, %r11;
	sub.s32 	%r520, %r519, %r18;
	mul.wide.s32 	%rd217, %r520, 4;
	add.s64 	%rd418, %rd418, %rd217;
	add.s32 	%r757, %r757, 1;
	setp.lt.s32	%p22, %r757, %r19;
	@%p22 bra 	BB8_23;

BB8_30:
	add.s32 	%r756, %r756, 1;
	setp.lt.s32	%p23, %r756, %r20;
	@%p23 bra 	BB8_21;

BB8_31:
	ld.local.f32 	%f189, [%rd4];
	mov.f32 	%f190, 0f3C23D70A;
	max.ftz.f32 	%f191, %f189, %f190;
	rcp.approx.ftz.f32 	%f192, %f191;
	ld.local.f32 	%f193, [%rd4+4];
	ld.local.f32 	%f194, [%rd4+8];
	st.local.f32 	[%rd4], %f192;
	max.ftz.f32 	%f195, %f193, %f190;
	rcp.approx.ftz.f32 	%f196, %f195;
	st.local.f32 	[%rd4+4], %f196;
	max.ftz.f32 	%f197, %f194, %f190;
	rcp.approx.ftz.f32 	%f198, %f197;
	st.local.f32 	[%rd4+8], %f198;
	@%p4 bra 	BB8_33;

	ld.local.f32 	%f199, [%rd4+24];
	max.ftz.f32 	%f201, %f199, %f190;
	rcp.approx.ftz.f32 	%f202, %f201;
	st.local.f32 	[%rd4+40], %f202;

BB8_33:
	ld.local.f32 	%f203, [%rd4+16];
	max.ftz.f32 	%f205, %f203, %f190;
	rcp.approx.ftz.f32 	%f206, %f205;
	ld.local.f32 	%f207, [%rd4+20];
	ld.local.f32 	%f208, [%rd4+12];
	st.local.f32 	[%rd4+24], %f206;
	sqrt.approx.ftz.f32 	%f209, %f207;
	max.ftz.f32 	%f210, %f209, %f190;
	rcp.approx.ftz.f32 	%f211, %f210;
	st.local.f32 	[%rd4+36], %f211;
	st.local.f32 	[%rd4+32], %f211;
	st.local.f32 	[%rd4+28], %f211;
	sqrt.approx.ftz.f32 	%f212, %f208;
	max.ftz.f32 	%f213, %f212, %f190;
	rcp.approx.ftz.f32 	%f214, %f213;
	st.local.f32 	[%rd4+20], %f214;
	st.local.f32 	[%rd4+16], %f214;
	st.local.f32 	[%rd4+12], %f214;
	mov.u32 	%r760, 0;

BB8_34:
	setp.lt.s32	%p25, %r760, 0;
	@%p25 bra 	BB8_37;

	mul.lo.s32 	%r523, %r14, %r760;
	mul.wide.s32 	%rd218, %r523, 4;
	add.s64 	%rd419, %rd5, %rd218;
	mov.u32 	%r761, -1;

BB8_36:
	mov.u32 	%r524, 0;
	st.local.u32 	[%rd419], %r524;
	add.s64 	%rd419, %rd419, 4;
	add.s32 	%r761, %r761, 1;
	setp.lt.s32	%p26, %r761, %r760;
	@%p26 bra 	BB8_36;

BB8_37:
	add.s32 	%r760, %r760, 1;
	setp.lt.s32	%p27, %r760, %r14;
	@%p27 bra 	BB8_34;

	setp.lt.s32	%p217, %r20, 1;
	@%p217 bra 	BB8_55;

	sub.s32 	%r526, %r17, %r451;
	sub.s32 	%r527, %r15, %r450;
	mad.lo.s32 	%r49, %r526, %r11, %r527;
	add.s32 	%r528, %r15, %r11;
	sub.s32 	%r529, %r528, %r18;
	cvt.s64.s32	%rd33, %r529;
	cvt.s64.s32	%rd34, %r444;
	mul.wide.s32 	%rd35, %r444, 4;
	mov.u32 	%r762, 0;
	shl.b64 	%rd222, %rd34, 2;
	shl.b64 	%rd236, %rd33, 2;

BB8_40:
	mad.lo.s32 	%r530, %r762, %r445, %r49;
	mul.wide.s32 	%rd219, %r530, 4;
	add.s64 	%rd422, %rd6, %rd219;
	setp.le.s32	%p29, %r19, %r17;
	@%p29 bra 	BB8_54;

	mul.wide.s32 	%rd220, %r762, 4;
	add.s64 	%rd221, %rd2, %rd220;
	ld.global.u32 	%r531, [%rd221+108];
	cvt.rn.f32.s32	%f15, %r531;
	mov.u32 	%r763, %r17;

BB8_42:
	setp.le.s32	%p30, %r18, %r15;
	@%p30 bra 	BB8_53;

	mov.u32 	%r764, %r15;

BB8_44:
	cvt.rn.f32.s32	%f730, %r763;
	cvt.rn.f32.s32	%f215, %r764;
	st.shared.f32 	[%r13], %f215;
	st.shared.f32 	[%r13+4], %f730;
	ld.global.nc.f32 	%f216, [%rd422];
	abs.ftz.f32 	%f217, %f216;
	st.shared.f32 	[%r13+8], %f217;
	add.s64 	%rd223, %rd422, %rd222;
	ld.global.nc.f32 	%f218, [%rd223];
	st.shared.f32 	[%r13+12], %f218;
	add.s64 	%rd224, %rd223, %rd35;
	ld.global.nc.f32 	%f219, [%rd224];
	st.shared.f32 	[%r13+16], %f219;
	add.s64 	%rd225, %rd224, %rd35;
	ld.global.nc.f32 	%f220, [%rd225];
	st.shared.f32 	[%r13+20], %f220;
	add.s64 	%rd226, %rd225, %rd35;
	ld.global.nc.f32 	%f221, [%rd226];
	st.shared.f32 	[%r13+24], %f221;
	add.s64 	%rd227, %rd226, %rd35;
	ld.global.nc.f32 	%f222, [%rd227];
	st.shared.f32 	[%r13+28], %f222;
	add.s64 	%rd228, %rd227, %rd35;
	ld.global.nc.f32 	%f223, [%rd228];
	st.shared.f32 	[%r13+32], %f223;
	add.s64 	%rd229, %rd228, %rd35;
	ld.global.nc.f32 	%f224, [%rd229];
	st.shared.f32 	[%r13+36], %f224;
	mov.u32 	%r765, 0;
	@%p4 bra 	BB8_46;

	mov.u32 	%r765, 0;
	st.shared.f32 	[%r13+40], %f15;

BB8_46:
	mov.u32 	%r766, 0;
	mul.wide.s32 	%rd230, %r765, 4;
	add.s64 	%rd231, %rd3, %rd230;
	add.s32 	%r545, %r765, %r12;
	shl.b32 	%r546, %r545, 2;
	add.s32 	%r548, %r465, %r546;
	ld.shared.f32 	%f225, [%r548];
	ld.local.f32 	%f226, [%rd231];
	sub.ftz.f32 	%f227, %f225, %f226;
	st.shared.f32 	[%r548], %f227;
	add.s32 	%r765, %r765, 1;
	setp.lt.s32	%p32, %r765, %r14;
	@%p32 bra 	BB8_46;

BB8_47:
	mov.u32 	%r767, 0;
	mul.wide.s32 	%rd232, %r766, 4;
	add.s64 	%rd233, %rd4, %rd232;
	add.s32 	%r550, %r766, %r12;
	shl.b32 	%r551, %r550, 2;
	add.s32 	%r553, %r465, %r551;
	ld.shared.f32 	%f228, [%r553];
	ld.local.f32 	%f229, [%rd233];
	mul.ftz.f32 	%f230, %f229, %f228;
	st.shared.f32 	[%r553], %f230;
	add.s32 	%r766, %r766, 1;
	setp.lt.s32	%p33, %r766, %r14;
	@%p33 bra 	BB8_47;

BB8_48:
	setp.lt.s32	%p34, %r767, 0;
	@%p34 bra 	BB8_51;

	add.s32 	%r555, %r767, %r12;
	shl.b32 	%r556, %r555, 2;
	add.s32 	%r558, %r465, %r556;
	ld.shared.f32 	%f17, [%r558];
	mul.lo.s32 	%r59, %r767, %r14;
	mov.u32 	%r768, 0;

BB8_50:
	add.s32 	%r559, %r768, %r12;
	shl.b32 	%r560, %r559, 2;
	add.s32 	%r562, %r465, %r560;
	ld.shared.f32 	%f231, [%r562];
	add.s32 	%r563, %r768, %r59;
	mul.wide.s32 	%rd234, %r563, 4;
	add.s64 	%rd235, %rd5, %rd234;
	ld.local.f32 	%f232, [%rd235];
	fma.rn.ftz.f32 	%f233, %f17, %f231, %f232;
	st.local.f32 	[%rd235], %f233;
	add.s32 	%r61, %r768, 1;
	setp.lt.s32	%p35, %r768, %r767;
	mov.u32 	%r768, %r61;
	@%p35 bra 	BB8_50;

BB8_51:
	add.s32 	%r767, %r767, 1;
	setp.lt.s32	%p36, %r767, %r14;
	@%p36 bra 	BB8_48;

	add.s32 	%r764, %r764, 1;
	add.s64 	%rd422, %rd422, 4;
	setp.lt.s32	%p37, %r764, %r18;
	@%p37 bra 	BB8_44;

BB8_53:
	add.s64 	%rd422, %rd422, %rd236;
	add.s32 	%r763, %r763, 1;
	setp.lt.s32	%p38, %r763, %r19;
	@%p38 bra 	BB8_42;

BB8_54:
	add.s32 	%r762, %r762, 1;
	setp.lt.s32	%p39, %r762, %r20;
	@%p39 bra 	BB8_40;

BB8_55:
	mul.lo.s32 	%r67, %r9, %r14;
	mov.u32 	%r564, 0;
	mov.u32 	%r769, %r564;

BB8_56:
	neg.s32 	%r771, %r769;
	mul.lo.s32 	%r770, %r67, %r769;
	mov.u32 	%r772, %r564;

BB8_57:
	setp.eq.s32	%p40, %r771, 0;
	selp.f32	%f234, 0f3F800000, 0f00000000, %p40;
	add.s32 	%r566, %r770, %r8;
	mul.wide.s32 	%rd237, %r566, 4;
	add.s64 	%rd238, %rd1, %rd237;
	st.global.f32 	[%rd238], %f234;
	add.s32 	%r771, %r771, 1;
	add.s32 	%r770, %r770, %r9;
	add.s32 	%r772, %r772, 1;
	setp.lt.s32	%p41, %r772, %r14;
	@%p41 bra 	BB8_57;

	add.s32 	%r769, %r769, 1;
	setp.lt.s32	%p42, %r769, %r14;
	@%p42 bra 	BB8_56;

	mul.lo.s32 	%r568, %r14, %r14;
	cvt.rn.f32.s32	%f18, %r568;
	mov.u32 	%r773, 1;
	mov.f32 	%f733, 0f00000000;

BB8_60:
	setp.lt.s32	%p43, %r773, 1;
	@%p43 bra 	BB8_63;

	mul.lo.s32 	%r570, %r14, %r773;
	mul.wide.s32 	%rd239, %r570, 4;
	add.s64 	%rd423, %rd5, %rd239;
	mov.u32 	%r774, 0;

BB8_62:
	ld.local.f32 	%f236, [%rd423];
	abs.ftz.f32 	%f237, %f236;
	add.ftz.f32 	%f733, %f733, %f237;
	add.s64 	%rd423, %rd423, 4;
	add.s32 	%r774, %r774, 1;
	setp.lt.s32	%p44, %r774, %r773;
	@%p44 bra 	BB8_62;

BB8_63:
	add.s32 	%r773, %r773, 1;
	setp.lt.s32	%p45, %r773, %r14;
	@%p45 bra 	BB8_60;

	setp.lt.ftz.f32	%p46, %f733, 0f33D6BF95;
	@%p46 bra 	BB8_272;

	mul.ftz.f32 	%f238, %f733, 0f3E4CCCCD;
	div.approx.ftz.f32 	%f23, %f238, %f18;
	add.s64 	%rd47, %rd5, 4;
	mov.u32 	%r775, 1;

BB8_66:
	setp.lt.s32	%p47, %r775, 1;
	@%p47 bra 	BB8_84;

	mul.lo.s32 	%r84, %r775, %r14;
	add.s32 	%r573, %r84, %r775;
	mul.wide.s32 	%rd240, %r573, 4;
	add.s64 	%rd48, %rd5, %rd240;
	mul.wide.s32 	%rd241, %r84, 4;
	add.s64 	%rd49, %rd5, %rd241;
	add.s32 	%r574, %r775, 1;
	mul.lo.s32 	%r85, %r14, %r574;
	add.s32 	%r86, %r775, %r85;
	mul.lo.s32 	%r87, %r84, %r9;
	mov.u32 	%r776, 0;

BB8_68:
	add.s32 	%r575, %r776, %r84;
	mul.wide.s32 	%rd242, %r575, 4;
	add.s64 	%rd50, %rd5, %rd242;
	ld.local.f32 	%f24, [%rd50];
	abs.ftz.f32 	%f25, %f24;
	setp.eq.ftz.f32	%p48, %f24, 0f00000000;
	setp.lt.ftz.f32	%p49, %f25, %f23;
	or.pred  	%p50, %p48, %p49;
	@%p50 bra 	BB8_83;

	ld.local.f32 	%f26, [%rd48];
	mul.lo.s32 	%r89, %r776, %r14;
	add.s32 	%r576, %r89, %r776;
	mul.wide.s32 	%rd243, %r576, 4;
	add.s64 	%rd51, %rd5, %rd243;
	ld.local.f32 	%f239, [%rd51];
	sub.ftz.f32 	%f27, %f26, %f239;
	abs.ftz.f32 	%f240, %f27;
	mul.ftz.f32 	%f241, %f240, 0f3089705F;
	setp.gt.ftz.f32	%p51, %f25, %f241;
	@%p51 bra 	BB8_71;
	bra.uni 	BB8_70;

BB8_71:
	mul.ftz.f32 	%f242, %f27, 0f3F000000;
	div.approx.ftz.f32 	%f243, %f242, %f24;
	abs.ftz.f32 	%f244, %f243;
	fma.rn.ftz.f32 	%f245, %f243, %f243, 0f3F800000;
	sqrt.approx.ftz.f32 	%f246, %f245;
	add.ftz.f32 	%f247, %f244, %f246;
	rcp.approx.ftz.f32 	%f248, %f247;
	setp.lt.ftz.f32	%p52, %f243, 0f00000000;
	neg.ftz.f32 	%f249, %f248;
	selp.f32	%f734, %f249, %f248, %p52;
	bra.uni 	BB8_72;

BB8_70:
	div.approx.ftz.f32 	%f734, %f24, %f27;

BB8_72:
	fma.rn.ftz.f32 	%f250, %f734, %f734, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f251, %f250;
	mul.ftz.f32 	%f31, %f734, %f251;
	add.ftz.f32 	%f252, %f251, 0f3F800000;
	div.approx.ftz.f32 	%f32, %f31, %f252;
	mul.ftz.f32 	%f253, %f24, %f734;
	add.ftz.f32 	%f254, %f253, %f26;
	st.local.f32 	[%rd48], %f254;
	ld.local.f32 	%f255, [%rd51];
	sub.ftz.f32 	%f256, %f255, %f253;
	st.local.f32 	[%rd51], %f256;
	mov.u32 	%r785, 0;
	st.local.u32 	[%rd50], %r785;
	setp.lt.s32	%p53, %r776, 1;
	@%p53 bra 	BB8_75;

	mul.lo.s32 	%r579, %r14, %r776;
	mul.wide.s32 	%rd244, %r579, 4;
	add.s64 	%rd425, %rd5, %rd244;
	mov.u32 	%r777, 0;
	mov.u64 	%rd424, %rd49;

BB8_74:
	ld.local.f32 	%f257, [%rd425];
	ld.local.f32 	%f258, [%rd424];
	fma.rn.ftz.f32 	%f259, %f32, %f257, %f258;
	mul.ftz.f32 	%f260, %f31, %f259;
	sub.ftz.f32 	%f261, %f257, %f260;
	st.local.f32 	[%rd425], %f261;
	mul.ftz.f32 	%f262, %f32, %f258;
	sub.ftz.f32 	%f263, %f257, %f262;
	ld.local.f32 	%f264, [%rd424];
	fma.rn.ftz.f32 	%f265, %f31, %f263, %f264;
	st.local.f32 	[%rd424], %f265;
	add.s64 	%rd425, %rd425, 4;
	add.s64 	%rd424, %rd424, 4;
	add.s32 	%r777, %r777, 1;
	setp.lt.s32	%p54, %r777, %r776;
	@%p54 bra 	BB8_74;

BB8_75:
	add.s32 	%r779, %r776, 1;
	setp.ge.s32	%p55, %r779, %r775;
	@%p55 bra 	BB8_78;

	add.s32 	%r580, %r776, 1;
	mad.lo.s32 	%r778, %r14, %r580, %r776;
	add.s32 	%r581, %r84, %r776;
	mul.wide.s32 	%rd245, %r581, 4;
	add.s64 	%rd426, %rd47, %rd245;

BB8_77:
	mul.wide.s32 	%rd246, %r778, 4;
	add.s64 	%rd247, %rd5, %rd246;
	ld.local.f32 	%f266, [%rd247];
	ld.local.f32 	%f267, [%rd426];
	fma.rn.ftz.f32 	%f268, %f32, %f266, %f267;
	mul.ftz.f32 	%f269, %f31, %f268;
	sub.ftz.f32 	%f270, %f266, %f269;
	st.local.f32 	[%rd247], %f270;
	mul.ftz.f32 	%f271, %f32, %f267;
	sub.ftz.f32 	%f272, %f266, %f271;
	ld.local.f32 	%f273, [%rd426];
	fma.rn.ftz.f32 	%f274, %f31, %f272, %f273;
	st.local.f32 	[%rd426], %f274;
	add.s32 	%r778, %r778, %r14;
	add.s64 	%rd426, %rd426, 4;
	add.s32 	%r779, %r779, 1;
	setp.lt.s32	%p56, %r779, %r775;
	@%p56 bra 	BB8_77;

BB8_78:
	setp.ge.s32	%p57, %r574, %r14;
	@%p57 bra 	BB8_81;

	add.s32 	%r780, %r85, %r776;
	mov.u32 	%r781, %r86;
	mov.u32 	%r782, %r574;

BB8_80:
	mul.wide.s32 	%rd248, %r780, 4;
	add.s64 	%rd249, %rd5, %rd248;
	mul.wide.s32 	%rd250, %r781, 4;
	add.s64 	%rd251, %rd5, %rd250;
	ld.local.f32 	%f275, [%rd249];
	ld.local.f32 	%f276, [%rd251];
	fma.rn.ftz.f32 	%f277, %f32, %f275, %f276;
	mul.ftz.f32 	%f278, %f31, %f277;
	sub.ftz.f32 	%f279, %f275, %f278;
	st.local.f32 	[%rd249], %f279;
	mul.ftz.f32 	%f280, %f32, %f276;
	sub.ftz.f32 	%f281, %f275, %f280;
	ld.local.f32 	%f282, [%rd251];
	fma.rn.ftz.f32 	%f283, %f31, %f281, %f282;
	st.local.f32 	[%rd251], %f283;
	add.s32 	%r781, %r781, %r14;
	add.s32 	%r780, %r780, %r14;
	add.s32 	%r782, %r782, 1;
	setp.lt.s32	%p58, %r782, %r14;
	@%p58 bra 	BB8_80;

BB8_81:
	mul.lo.s32 	%r784, %r89, %r9;
	mov.u32 	%r783, %r87;

BB8_82:
	add.s32 	%r584, %r784, %r8;
	mul.wide.s32 	%rd252, %r584, 4;
	add.s64 	%rd253, %rd1, %rd252;
	add.s32 	%r585, %r783, %r8;
	mul.wide.s32 	%rd254, %r585, 4;
	add.s64 	%rd255, %rd1, %rd254;
	ld.global.f32 	%f284, [%rd253];
	ld.global.f32 	%f285, [%rd255];
	fma.rn.ftz.f32 	%f286, %f32, %f284, %f285;
	mul.ftz.f32 	%f287, %f31, %f286;
	sub.ftz.f32 	%f288, %f284, %f287;
	st.global.f32 	[%rd253], %f288;
	mul.ftz.f32 	%f289, %f32, %f285;
	sub.ftz.f32 	%f290, %f284, %f289;
	ld.global.f32 	%f291, [%rd255];
	fma.rn.ftz.f32 	%f292, %f31, %f290, %f291;
	st.global.f32 	[%rd255], %f292;
	add.s32 	%r784, %r784, %r9;
	add.s32 	%r783, %r783, %r9;
	add.s32 	%r785, %r785, 1;
	setp.lt.s32	%p59, %r785, %r14;
	@%p59 bra 	BB8_82;

BB8_83:
	add.s32 	%r776, %r776, 1;
	setp.lt.s32	%p60, %r776, %r775;
	@%p60 bra 	BB8_68;

BB8_84:
	add.s32 	%r775, %r775, 1;
	mov.u32 	%r786, 1;
	setp.lt.s32	%p61, %r775, %r14;
	mov.f32 	%f737, 0f00000000;
	@%p61 bra 	BB8_66;

BB8_85:
	setp.lt.s32	%p62, %r786, 1;
	@%p62 bra 	BB8_88;

	mul.lo.s32 	%r588, %r14, %r786;
	mul.wide.s32 	%rd256, %r588, 4;
	add.s64 	%rd427, %rd5, %rd256;
	mov.u32 	%r787, 0;

BB8_87:
	ld.local.f32 	%f294, [%rd427];
	abs.ftz.f32 	%f295, %f294;
	add.ftz.f32 	%f737, %f737, %f295;
	add.s64 	%rd427, %rd427, 4;
	add.s32 	%r787, %r787, 1;
	setp.lt.s32	%p63, %r787, %r786;
	@%p63 bra 	BB8_87;

BB8_88:
	add.s32 	%r786, %r786, 1;
	setp.lt.s32	%p64, %r786, %r14;
	@%p64 bra 	BB8_85;

	setp.lt.ftz.f32	%p65, %f737, 0f33D6BF95;
	@%p65 bra 	BB8_272;

	mul.ftz.f32 	%f296, %f737, 0f3E4CCCCD;
	div.approx.ftz.f32 	%f37, %f296, %f18;
	mov.u32 	%r788, 1;

BB8_91:
	setp.lt.s32	%p66, %r788, 1;
	@%p66 bra 	BB8_109;

	mul.lo.s32 	%r121, %r788, %r14;
	add.s32 	%r591, %r121, %r788;
	mul.wide.s32 	%rd257, %r591, 4;
	add.s64 	%rd65, %rd5, %rd257;
	mul.wide.s32 	%rd258, %r121, 4;
	add.s64 	%rd66, %rd5, %rd258;
	add.s32 	%r592, %r788, 1;
	mul.lo.s32 	%r122, %r14, %r592;
	add.s32 	%r123, %r788, %r122;
	mul.lo.s32 	%r124, %r121, %r9;
	mov.u32 	%r789, 0;

BB8_93:
	add.s32 	%r593, %r789, %r121;
	mul.wide.s32 	%rd259, %r593, 4;
	add.s64 	%rd67, %rd5, %rd259;
	ld.local.f32 	%f38, [%rd67];
	abs.ftz.f32 	%f39, %f38;
	setp.eq.ftz.f32	%p67, %f38, 0f00000000;
	setp.lt.ftz.f32	%p68, %f39, %f37;
	or.pred  	%p69, %p67, %p68;
	@%p69 bra 	BB8_108;

	ld.local.f32 	%f40, [%rd65];
	mul.lo.s32 	%r126, %r789, %r14;
	add.s32 	%r594, %r126, %r789;
	mul.wide.s32 	%rd260, %r594, 4;
	add.s64 	%rd68, %rd5, %rd260;
	ld.local.f32 	%f297, [%rd68];
	sub.ftz.f32 	%f41, %f40, %f297;
	abs.ftz.f32 	%f298, %f41;
	mul.ftz.f32 	%f299, %f298, 0f3089705F;
	setp.gt.ftz.f32	%p70, %f39, %f299;
	@%p70 bra 	BB8_96;
	bra.uni 	BB8_95;

BB8_96:
	mul.ftz.f32 	%f300, %f41, 0f3F000000;
	div.approx.ftz.f32 	%f301, %f300, %f38;
	abs.ftz.f32 	%f302, %f301;
	fma.rn.ftz.f32 	%f303, %f301, %f301, 0f3F800000;
	sqrt.approx.ftz.f32 	%f304, %f303;
	add.ftz.f32 	%f305, %f302, %f304;
	rcp.approx.ftz.f32 	%f306, %f305;
	setp.lt.ftz.f32	%p71, %f301, 0f00000000;
	neg.ftz.f32 	%f307, %f306;
	selp.f32	%f738, %f307, %f306, %p71;
	bra.uni 	BB8_97;

BB8_95:
	div.approx.ftz.f32 	%f738, %f38, %f41;

BB8_97:
	fma.rn.ftz.f32 	%f308, %f738, %f738, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f309, %f308;
	mul.ftz.f32 	%f45, %f738, %f309;
	add.ftz.f32 	%f310, %f309, 0f3F800000;
	div.approx.ftz.f32 	%f46, %f45, %f310;
	mul.ftz.f32 	%f311, %f38, %f738;
	add.ftz.f32 	%f312, %f311, %f40;
	st.local.f32 	[%rd65], %f312;
	ld.local.f32 	%f313, [%rd68];
	sub.ftz.f32 	%f314, %f313, %f311;
	st.local.f32 	[%rd68], %f314;
	mov.u32 	%r798, 0;
	st.local.u32 	[%rd67], %r798;
	setp.lt.s32	%p72, %r789, 1;
	@%p72 bra 	BB8_100;

	mul.lo.s32 	%r597, %r14, %r789;
	mul.wide.s32 	%rd261, %r597, 4;
	add.s64 	%rd428, %rd5, %rd261;
	mov.u32 	%r790, 0;
	mov.u64 	%rd429, %rd66;

BB8_99:
	ld.local.f32 	%f315, [%rd428];
	ld.local.f32 	%f316, [%rd429];
	fma.rn.ftz.f32 	%f317, %f46, %f315, %f316;
	mul.ftz.f32 	%f318, %f45, %f317;
	sub.ftz.f32 	%f319, %f315, %f318;
	st.local.f32 	[%rd428], %f319;
	mul.ftz.f32 	%f320, %f46, %f316;
	sub.ftz.f32 	%f321, %f315, %f320;
	ld.local.f32 	%f322, [%rd429];
	fma.rn.ftz.f32 	%f323, %f45, %f321, %f322;
	st.local.f32 	[%rd429], %f323;
	add.s64 	%rd429, %rd429, 4;
	add.s64 	%rd428, %rd428, 4;
	add.s32 	%r790, %r790, 1;
	setp.lt.s32	%p73, %r790, %r789;
	@%p73 bra 	BB8_99;

BB8_100:
	add.s32 	%r792, %r789, 1;
	setp.ge.s32	%p74, %r792, %r788;
	@%p74 bra 	BB8_103;

	add.s32 	%r598, %r121, %r789;
	mul.wide.s32 	%rd262, %r598, 4;
	add.s64 	%rd430, %rd47, %rd262;
	add.s32 	%r599, %r789, 1;
	mad.lo.s32 	%r791, %r14, %r599, %r789;

BB8_102:
	mul.wide.s32 	%rd263, %r791, 4;
	add.s64 	%rd264, %rd5, %rd263;
	ld.local.f32 	%f324, [%rd264];
	ld.local.f32 	%f325, [%rd430];
	fma.rn.ftz.f32 	%f326, %f46, %f324, %f325;
	mul.ftz.f32 	%f327, %f45, %f326;
	sub.ftz.f32 	%f328, %f324, %f327;
	st.local.f32 	[%rd264], %f328;
	mul.ftz.f32 	%f329, %f46, %f325;
	sub.ftz.f32 	%f330, %f324, %f329;
	ld.local.f32 	%f331, [%rd430];
	fma.rn.ftz.f32 	%f332, %f45, %f330, %f331;
	st.local.f32 	[%rd430], %f332;
	add.s64 	%rd430, %rd430, 4;
	add.s32 	%r791, %r791, %r14;
	add.s32 	%r792, %r792, 1;
	setp.lt.s32	%p75, %r792, %r788;
	@%p75 bra 	BB8_102;

BB8_103:
	setp.ge.s32	%p76, %r592, %r14;
	@%p76 bra 	BB8_106;

	add.s32 	%r793, %r122, %r789;
	mov.u32 	%r794, %r123;
	mov.u32 	%r795, %r592;

BB8_105:
	mul.wide.s32 	%rd265, %r793, 4;
	add.s64 	%rd266, %rd5, %rd265;
	mul.wide.s32 	%rd267, %r794, 4;
	add.s64 	%rd268, %rd5, %rd267;
	ld.local.f32 	%f333, [%rd266];
	ld.local.f32 	%f334, [%rd268];
	fma.rn.ftz.f32 	%f335, %f46, %f333, %f334;
	mul.ftz.f32 	%f336, %f45, %f335;
	sub.ftz.f32 	%f337, %f333, %f336;
	st.local.f32 	[%rd266], %f337;
	mul.ftz.f32 	%f338, %f46, %f334;
	sub.ftz.f32 	%f339, %f333, %f338;
	ld.local.f32 	%f340, [%rd268];
	fma.rn.ftz.f32 	%f341, %f45, %f339, %f340;
	st.local.f32 	[%rd268], %f341;
	add.s32 	%r794, %r794, %r14;
	add.s32 	%r793, %r793, %r14;
	add.s32 	%r795, %r795, 1;
	setp.lt.s32	%p77, %r795, %r14;
	@%p77 bra 	BB8_105;

BB8_106:
	mul.lo.s32 	%r796, %r126, %r9;
	mov.u32 	%r797, %r124;

BB8_107:
	add.s32 	%r602, %r796, %r8;
	mul.wide.s32 	%rd269, %r602, 4;
	add.s64 	%rd270, %rd1, %rd269;
	add.s32 	%r603, %r797, %r8;
	mul.wide.s32 	%rd271, %r603, 4;
	add.s64 	%rd272, %rd1, %rd271;
	ld.global.f32 	%f342, [%rd270];
	ld.global.f32 	%f343, [%rd272];
	fma.rn.ftz.f32 	%f344, %f46, %f342, %f343;
	mul.ftz.f32 	%f345, %f45, %f344;
	sub.ftz.f32 	%f346, %f342, %f345;
	st.global.f32 	[%rd270], %f346;
	mul.ftz.f32 	%f347, %f46, %f343;
	sub.ftz.f32 	%f348, %f342, %f347;
	ld.global.f32 	%f349, [%rd272];
	fma.rn.ftz.f32 	%f350, %f45, %f348, %f349;
	st.global.f32 	[%rd272], %f350;
	add.s32 	%r797, %r797, %r9;
	add.s32 	%r796, %r796, %r9;
	add.s32 	%r798, %r798, 1;
	setp.lt.s32	%p78, %r798, %r14;
	@%p78 bra 	BB8_107;

BB8_108:
	add.s32 	%r789, %r789, 1;
	setp.lt.s32	%p79, %r789, %r788;
	@%p79 bra 	BB8_93;

BB8_109:
	add.s32 	%r788, %r788, 1;
	mov.u32 	%r799, 1;
	setp.lt.s32	%p80, %r788, %r14;
	mov.f32 	%f741, 0f00000000;
	@%p80 bra 	BB8_91;

BB8_110:
	setp.lt.s32	%p81, %r799, 1;
	@%p81 bra 	BB8_113;

	mul.lo.s32 	%r606, %r14, %r799;
	mul.wide.s32 	%rd273, %r606, 4;
	add.s64 	%rd431, %rd5, %rd273;
	mov.u32 	%r800, 0;

BB8_112:
	ld.local.f32 	%f352, [%rd431];
	abs.ftz.f32 	%f353, %f352;
	add.ftz.f32 	%f741, %f741, %f353;
	add.s64 	%rd431, %rd431, 4;
	add.s32 	%r800, %r800, 1;
	setp.lt.s32	%p82, %r800, %r799;
	@%p82 bra 	BB8_112;

BB8_113:
	add.s32 	%r799, %r799, 1;
	setp.lt.s32	%p83, %r799, %r14;
	@%p83 bra 	BB8_110;

	setp.lt.ftz.f32	%p84, %f741, 0f33D6BF95;
	@%p84 bra 	BB8_272;

	mul.ftz.f32 	%f354, %f741, 0f3E4CCCCD;
	div.approx.ftz.f32 	%f51, %f354, %f18;
	mov.u32 	%r801, 1;

BB8_116:
	setp.lt.s32	%p85, %r801, 1;
	@%p85 bra 	BB8_134;

	mul.lo.s32 	%r158, %r801, %r14;
	add.s32 	%r609, %r158, %r801;
	mul.wide.s32 	%rd274, %r609, 4;
	add.s64 	%rd82, %rd5, %rd274;
	mul.wide.s32 	%rd275, %r158, 4;
	add.s64 	%rd83, %rd5, %rd275;
	add.s32 	%r610, %r801, 1;
	mul.lo.s32 	%r159, %r14, %r610;
	add.s32 	%r160, %r801, %r159;
	mul.lo.s32 	%r161, %r158, %r9;
	mov.u32 	%r802, 0;

BB8_118:
	add.s32 	%r611, %r802, %r158;
	mul.wide.s32 	%rd276, %r611, 4;
	add.s64 	%rd84, %rd5, %rd276;
	ld.local.f32 	%f52, [%rd84];
	abs.ftz.f32 	%f53, %f52;
	setp.eq.ftz.f32	%p86, %f52, 0f00000000;
	setp.lt.ftz.f32	%p87, %f53, %f51;
	or.pred  	%p88, %p86, %p87;
	@%p88 bra 	BB8_133;

	ld.local.f32 	%f54, [%rd82];
	mul.lo.s32 	%r163, %r802, %r14;
	add.s32 	%r612, %r163, %r802;
	mul.wide.s32 	%rd277, %r612, 4;
	add.s64 	%rd85, %rd5, %rd277;
	ld.local.f32 	%f355, [%rd85];
	sub.ftz.f32 	%f55, %f54, %f355;
	abs.ftz.f32 	%f356, %f55;
	mul.ftz.f32 	%f357, %f356, 0f3089705F;
	setp.gt.ftz.f32	%p89, %f53, %f357;
	@%p89 bra 	BB8_121;
	bra.uni 	BB8_120;

BB8_121:
	mul.ftz.f32 	%f358, %f55, 0f3F000000;
	div.approx.ftz.f32 	%f359, %f358, %f52;
	abs.ftz.f32 	%f360, %f359;
	fma.rn.ftz.f32 	%f361, %f359, %f359, 0f3F800000;
	sqrt.approx.ftz.f32 	%f362, %f361;
	add.ftz.f32 	%f363, %f360, %f362;
	rcp.approx.ftz.f32 	%f364, %f363;
	setp.lt.ftz.f32	%p90, %f359, 0f00000000;
	neg.ftz.f32 	%f365, %f364;
	selp.f32	%f742, %f365, %f364, %p90;
	bra.uni 	BB8_122;

BB8_120:
	div.approx.ftz.f32 	%f742, %f52, %f55;

BB8_122:
	fma.rn.ftz.f32 	%f366, %f742, %f742, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f367, %f366;
	mul.ftz.f32 	%f59, %f742, %f367;
	add.ftz.f32 	%f368, %f367, 0f3F800000;
	div.approx.ftz.f32 	%f60, %f59, %f368;
	mul.ftz.f32 	%f369, %f52, %f742;
	add.ftz.f32 	%f370, %f369, %f54;
	st.local.f32 	[%rd82], %f370;
	ld.local.f32 	%f371, [%rd85];
	sub.ftz.f32 	%f372, %f371, %f369;
	st.local.f32 	[%rd85], %f372;
	mov.u32 	%r811, 0;
	st.local.u32 	[%rd84], %r811;
	setp.lt.s32	%p91, %r802, 1;
	@%p91 bra 	BB8_125;

	mul.lo.s32 	%r615, %r14, %r802;
	mul.wide.s32 	%rd278, %r615, 4;
	add.s64 	%rd432, %rd5, %rd278;
	mov.u32 	%r803, 0;
	mov.u64 	%rd433, %rd83;

BB8_124:
	ld.local.f32 	%f373, [%rd432];
	ld.local.f32 	%f374, [%rd433];
	fma.rn.ftz.f32 	%f375, %f60, %f373, %f374;
	mul.ftz.f32 	%f376, %f59, %f375;
	sub.ftz.f32 	%f377, %f373, %f376;
	st.local.f32 	[%rd432], %f377;
	mul.ftz.f32 	%f378, %f60, %f374;
	sub.ftz.f32 	%f379, %f373, %f378;
	ld.local.f32 	%f380, [%rd433];
	fma.rn.ftz.f32 	%f381, %f59, %f379, %f380;
	st.local.f32 	[%rd433], %f381;
	add.s64 	%rd433, %rd433, 4;
	add.s64 	%rd432, %rd432, 4;
	add.s32 	%r803, %r803, 1;
	setp.lt.s32	%p92, %r803, %r802;
	@%p92 bra 	BB8_124;

BB8_125:
	add.s32 	%r805, %r802, 1;
	setp.ge.s32	%p93, %r805, %r801;
	@%p93 bra 	BB8_128;

	add.s32 	%r616, %r158, %r802;
	mul.wide.s32 	%rd279, %r616, 4;
	add.s64 	%rd434, %rd47, %rd279;
	add.s32 	%r617, %r802, 1;
	mad.lo.s32 	%r804, %r14, %r617, %r802;

BB8_127:
	mul.wide.s32 	%rd280, %r804, 4;
	add.s64 	%rd281, %rd5, %rd280;
	ld.local.f32 	%f382, [%rd281];
	ld.local.f32 	%f383, [%rd434];
	fma.rn.ftz.f32 	%f384, %f60, %f382, %f383;
	mul.ftz.f32 	%f385, %f59, %f384;
	sub.ftz.f32 	%f386, %f382, %f385;
	st.local.f32 	[%rd281], %f386;
	mul.ftz.f32 	%f387, %f60, %f383;
	sub.ftz.f32 	%f388, %f382, %f387;
	ld.local.f32 	%f389, [%rd434];
	fma.rn.ftz.f32 	%f390, %f59, %f388, %f389;
	st.local.f32 	[%rd434], %f390;
	add.s64 	%rd434, %rd434, 4;
	add.s32 	%r804, %r804, %r14;
	add.s32 	%r805, %r805, 1;
	setp.lt.s32	%p94, %r805, %r801;
	@%p94 bra 	BB8_127;

BB8_128:
	setp.ge.s32	%p95, %r610, %r14;
	@%p95 bra 	BB8_131;

	add.s32 	%r806, %r159, %r802;
	mov.u32 	%r807, %r160;
	mov.u32 	%r808, %r610;

BB8_130:
	mul.wide.s32 	%rd282, %r806, 4;
	add.s64 	%rd283, %rd5, %rd282;
	mul.wide.s32 	%rd284, %r807, 4;
	add.s64 	%rd285, %rd5, %rd284;
	ld.local.f32 	%f391, [%rd283];
	ld.local.f32 	%f392, [%rd285];
	fma.rn.ftz.f32 	%f393, %f60, %f391, %f392;
	mul.ftz.f32 	%f394, %f59, %f393;
	sub.ftz.f32 	%f395, %f391, %f394;
	st.local.f32 	[%rd283], %f395;
	mul.ftz.f32 	%f396, %f60, %f392;
	sub.ftz.f32 	%f397, %f391, %f396;
	ld.local.f32 	%f398, [%rd285];
	fma.rn.ftz.f32 	%f399, %f59, %f397, %f398;
	st.local.f32 	[%rd285], %f399;
	add.s32 	%r807, %r807, %r14;
	add.s32 	%r806, %r806, %r14;
	add.s32 	%r808, %r808, 1;
	setp.lt.s32	%p96, %r808, %r14;
	@%p96 bra 	BB8_130;

BB8_131:
	mul.lo.s32 	%r809, %r163, %r9;
	mov.u32 	%r810, %r161;

BB8_132:
	add.s32 	%r620, %r809, %r8;
	mul.wide.s32 	%rd286, %r620, 4;
	add.s64 	%rd287, %rd1, %rd286;
	add.s32 	%r621, %r810, %r8;
	mul.wide.s32 	%rd288, %r621, 4;
	add.s64 	%rd289, %rd1, %rd288;
	ld.global.f32 	%f400, [%rd287];
	ld.global.f32 	%f401, [%rd289];
	fma.rn.ftz.f32 	%f402, %f60, %f400, %f401;
	mul.ftz.f32 	%f403, %f59, %f402;
	sub.ftz.f32 	%f404, %f400, %f403;
	st.global.f32 	[%rd287], %f404;
	mul.ftz.f32 	%f405, %f60, %f401;
	sub.ftz.f32 	%f406, %f400, %f405;
	ld.global.f32 	%f407, [%rd289];
	fma.rn.ftz.f32 	%f408, %f59, %f406, %f407;
	st.global.f32 	[%rd289], %f408;
	add.s32 	%r810, %r810, %r9;
	add.s32 	%r809, %r809, %r9;
	add.s32 	%r811, %r811, 1;
	setp.lt.s32	%p97, %r811, %r14;
	@%p97 bra 	BB8_132;

BB8_133:
	add.s32 	%r802, %r802, 1;
	setp.lt.s32	%p98, %r802, %r801;
	@%p98 bra 	BB8_118;

BB8_134:
	add.s32 	%r801, %r801, 1;
	mov.u32 	%r812, 1;
	setp.lt.s32	%p99, %r801, %r14;
	mov.f32 	%f745, 0f00000000;
	@%p99 bra 	BB8_116;

BB8_135:
	setp.lt.s32	%p100, %r812, 1;
	@%p100 bra 	BB8_138;

	mul.lo.s32 	%r624, %r14, %r812;
	mul.wide.s32 	%rd290, %r624, 4;
	add.s64 	%rd435, %rd5, %rd290;
	mov.u32 	%r813, 0;

BB8_137:
	ld.local.f32 	%f410, [%rd435];
	abs.ftz.f32 	%f411, %f410;
	add.ftz.f32 	%f745, %f745, %f411;
	add.s64 	%rd435, %rd435, 4;
	add.s32 	%r813, %r813, 1;
	setp.lt.s32	%p101, %r813, %r812;
	@%p101 bra 	BB8_137;

BB8_138:
	add.s32 	%r812, %r812, 1;
	setp.lt.s32	%p102, %r812, %r14;
	@%p102 bra 	BB8_135;

	setp.lt.ftz.f32	%p103, %f745, 0f33D6BF95;
	@%p103 bra 	BB8_272;

	mov.u32 	%r814, 1;

BB8_141:
	setp.lt.s32	%p104, %r814, 1;
	@%p104 bra 	BB8_159;

	mul.lo.s32 	%r195, %r814, %r14;
	add.s32 	%r627, %r195, %r814;
	mul.wide.s32 	%rd291, %r627, 4;
	add.s64 	%rd99, %rd5, %rd291;
	mul.wide.s32 	%rd292, %r195, 4;
	add.s64 	%rd100, %rd5, %rd292;
	add.s32 	%r628, %r814, 1;
	mul.lo.s32 	%r196, %r14, %r628;
	add.s32 	%r197, %r814, %r196;
	mul.lo.s32 	%r198, %r195, %r9;
	mov.u32 	%r815, 0;

BB8_143:
	add.s32 	%r629, %r815, %r195;
	mul.wide.s32 	%rd293, %r629, 4;
	add.s64 	%rd101, %rd5, %rd293;
	ld.local.f32 	%f65, [%rd101];
	setp.eq.ftz.f32	%p105, %f65, 0f00000000;
	@%p105 bra 	BB8_158;

	abs.ftz.f32 	%f412, %f65;
	mul.lo.s32 	%r200, %r815, %r14;
	add.s32 	%r630, %r200, %r815;
	mul.wide.s32 	%rd294, %r630, 4;
	add.s64 	%rd102, %rd5, %rd294;
	ld.local.f32 	%f413, [%rd102];
	ld.local.f32 	%f66, [%rd99];
	sub.ftz.f32 	%f67, %f66, %f413;
	abs.ftz.f32 	%f414, %f67;
	mul.ftz.f32 	%f415, %f414, 0f3089705F;
	setp.gt.ftz.f32	%p106, %f412, %f415;
	@%p106 bra 	BB8_146;
	bra.uni 	BB8_145;

BB8_146:
	mul.ftz.f32 	%f416, %f67, 0f3F000000;
	div.approx.ftz.f32 	%f417, %f416, %f65;
	abs.ftz.f32 	%f418, %f417;
	fma.rn.ftz.f32 	%f419, %f417, %f417, 0f3F800000;
	sqrt.approx.ftz.f32 	%f420, %f419;
	add.ftz.f32 	%f421, %f418, %f420;
	rcp.approx.ftz.f32 	%f422, %f421;
	setp.lt.ftz.f32	%p107, %f417, 0f00000000;
	neg.ftz.f32 	%f423, %f422;
	selp.f32	%f746, %f423, %f422, %p107;
	bra.uni 	BB8_147;

BB8_145:
	div.approx.ftz.f32 	%f746, %f65, %f67;

BB8_147:
	fma.rn.ftz.f32 	%f424, %f746, %f746, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f425, %f424;
	mul.ftz.f32 	%f71, %f746, %f425;
	add.ftz.f32 	%f426, %f425, 0f3F800000;
	div.approx.ftz.f32 	%f72, %f71, %f426;
	mul.ftz.f32 	%f427, %f65, %f746;
	add.ftz.f32 	%f428, %f427, %f66;
	st.local.f32 	[%rd99], %f428;
	ld.local.f32 	%f429, [%rd102];
	sub.ftz.f32 	%f430, %f429, %f427;
	st.local.f32 	[%rd102], %f430;
	mov.u32 	%r824, 0;
	st.local.u32 	[%rd101], %r824;
	setp.lt.s32	%p108, %r815, 1;
	@%p108 bra 	BB8_150;

	mul.lo.s32 	%r633, %r14, %r815;
	mul.wide.s32 	%rd295, %r633, 4;
	add.s64 	%rd436, %rd5, %rd295;
	mov.u32 	%r816, 0;
	mov.u64 	%rd437, %rd100;

BB8_149:
	ld.local.f32 	%f431, [%rd436];
	ld.local.f32 	%f432, [%rd437];
	fma.rn.ftz.f32 	%f433, %f72, %f431, %f432;
	mul.ftz.f32 	%f434, %f71, %f433;
	sub.ftz.f32 	%f435, %f431, %f434;
	st.local.f32 	[%rd436], %f435;
	mul.ftz.f32 	%f436, %f72, %f432;
	sub.ftz.f32 	%f437, %f431, %f436;
	ld.local.f32 	%f438, [%rd437];
	fma.rn.ftz.f32 	%f439, %f71, %f437, %f438;
	st.local.f32 	[%rd437], %f439;
	add.s64 	%rd437, %rd437, 4;
	add.s64 	%rd436, %rd436, 4;
	add.s32 	%r816, %r816, 1;
	setp.lt.s32	%p109, %r816, %r815;
	@%p109 bra 	BB8_149;

BB8_150:
	add.s32 	%r818, %r815, 1;
	setp.ge.s32	%p110, %r818, %r814;
	@%p110 bra 	BB8_153;

	add.s32 	%r634, %r195, %r815;
	mul.wide.s32 	%rd296, %r634, 4;
	add.s64 	%rd438, %rd47, %rd296;
	add.s32 	%r635, %r815, 1;
	mad.lo.s32 	%r817, %r14, %r635, %r815;

BB8_152:
	mul.wide.s32 	%rd297, %r817, 4;
	add.s64 	%rd298, %rd5, %rd297;
	ld.local.f32 	%f440, [%rd298];
	ld.local.f32 	%f441, [%rd438];
	fma.rn.ftz.f32 	%f442, %f72, %f440, %f441;
	mul.ftz.f32 	%f443, %f71, %f442;
	sub.ftz.f32 	%f444, %f440, %f443;
	st.local.f32 	[%rd298], %f444;
	mul.ftz.f32 	%f445, %f72, %f441;
	sub.ftz.f32 	%f446, %f440, %f445;
	ld.local.f32 	%f447, [%rd438];
	fma.rn.ftz.f32 	%f448, %f71, %f446, %f447;
	st.local.f32 	[%rd438], %f448;
	add.s64 	%rd438, %rd438, 4;
	add.s32 	%r817, %r817, %r14;
	add.s32 	%r818, %r818, 1;
	setp.lt.s32	%p111, %r818, %r814;
	@%p111 bra 	BB8_152;

BB8_153:
	setp.ge.s32	%p112, %r628, %r14;
	@%p112 bra 	BB8_156;

	add.s32 	%r819, %r196, %r815;
	mov.u32 	%r820, %r197;
	mov.u32 	%r821, %r628;

BB8_155:
	mul.wide.s32 	%rd299, %r819, 4;
	add.s64 	%rd300, %rd5, %rd299;
	mul.wide.s32 	%rd301, %r820, 4;
	add.s64 	%rd302, %rd5, %rd301;
	ld.local.f32 	%f449, [%rd300];
	ld.local.f32 	%f450, [%rd302];
	fma.rn.ftz.f32 	%f451, %f72, %f449, %f450;
	mul.ftz.f32 	%f452, %f71, %f451;
	sub.ftz.f32 	%f453, %f449, %f452;
	st.local.f32 	[%rd300], %f453;
	mul.ftz.f32 	%f454, %f72, %f450;
	sub.ftz.f32 	%f455, %f449, %f454;
	ld.local.f32 	%f456, [%rd302];
	fma.rn.ftz.f32 	%f457, %f71, %f455, %f456;
	st.local.f32 	[%rd302], %f457;
	add.s32 	%r820, %r820, %r14;
	add.s32 	%r819, %r819, %r14;
	add.s32 	%r821, %r821, 1;
	setp.lt.s32	%p113, %r821, %r14;
	@%p113 bra 	BB8_155;

BB8_156:
	mul.lo.s32 	%r822, %r200, %r9;
	mov.u32 	%r823, %r198;

BB8_157:
	add.s32 	%r638, %r822, %r8;
	mul.wide.s32 	%rd303, %r638, 4;
	add.s64 	%rd304, %rd1, %rd303;
	add.s32 	%r639, %r823, %r8;
	mul.wide.s32 	%rd305, %r639, 4;
	add.s64 	%rd306, %rd1, %rd305;
	ld.global.f32 	%f458, [%rd304];
	ld.global.f32 	%f459, [%rd306];
	fma.rn.ftz.f32 	%f460, %f72, %f458, %f459;
	mul.ftz.f32 	%f461, %f71, %f460;
	sub.ftz.f32 	%f462, %f458, %f461;
	st.global.f32 	[%rd304], %f462;
	mul.ftz.f32 	%f463, %f72, %f459;
	sub.ftz.f32 	%f464, %f458, %f463;
	ld.global.f32 	%f465, [%rd306];
	fma.rn.ftz.f32 	%f466, %f71, %f464, %f465;
	st.global.f32 	[%rd306], %f466;
	add.s32 	%r823, %r823, %r9;
	add.s32 	%r822, %r822, %r9;
	add.s32 	%r824, %r824, 1;
	setp.lt.s32	%p114, %r824, %r14;
	@%p114 bra 	BB8_157;

BB8_158:
	add.s32 	%r815, %r815, 1;
	setp.lt.s32	%p115, %r815, %r814;
	@%p115 bra 	BB8_143;

BB8_159:
	add.s32 	%r814, %r814, 1;
	mov.u32 	%r825, 1;
	setp.lt.s32	%p116, %r814, %r14;
	mov.f32 	%f749, 0f00000000;
	@%p116 bra 	BB8_141;

BB8_160:
	setp.lt.s32	%p117, %r825, 1;
	@%p117 bra 	BB8_163;

	mul.lo.s32 	%r642, %r14, %r825;
	mul.wide.s32 	%rd307, %r642, 4;
	add.s64 	%rd439, %rd5, %rd307;
	mov.u32 	%r826, 0;

BB8_162:
	ld.local.f32 	%f468, [%rd439];
	abs.ftz.f32 	%f469, %f468;
	add.ftz.f32 	%f749, %f749, %f469;
	add.s64 	%rd439, %rd439, 4;
	add.s32 	%r826, %r826, 1;
	setp.lt.s32	%p118, %r826, %r825;
	@%p118 bra 	BB8_162;

BB8_163:
	add.s32 	%r825, %r825, 1;
	setp.lt.s32	%p119, %r825, %r14;
	@%p119 bra 	BB8_160;

	setp.lt.ftz.f32	%p120, %f749, 0f33D6BF95;
	@%p120 bra 	BB8_272;

	mov.u32 	%r827, 1;

BB8_166:
	setp.lt.s32	%p121, %r827, 1;
	@%p121 bra 	BB8_187;

	mul.lo.s32 	%r232, %r827, %r14;
	add.s32 	%r645, %r232, %r827;
	mul.wide.s32 	%rd308, %r645, 4;
	add.s64 	%rd116, %rd5, %rd308;
	mul.wide.s32 	%rd309, %r232, 4;
	add.s64 	%rd117, %rd5, %rd309;
	add.s32 	%r646, %r827, 1;
	mul.lo.s32 	%r233, %r14, %r646;
	add.s32 	%r234, %r827, %r233;
	mul.lo.s32 	%r235, %r232, %r9;
	mov.u32 	%r828, 0;

BB8_168:
	add.s32 	%r647, %r828, %r232;
	mul.wide.s32 	%rd310, %r647, 4;
	add.s64 	%rd118, %rd5, %rd310;
	ld.local.f32 	%f77, [%rd118];
	abs.ftz.f32 	%f78, %f77;
	ld.local.f32 	%f79, [%rd116];
	abs.ftz.f32 	%f470, %f79;
	mul.ftz.f32 	%f471, %f470, 0f3089705F;
	setp.gtu.ftz.f32	%p122, %f78, %f471;
	mul.lo.s32 	%r237, %r828, %r14;
	add.s32 	%r648, %r237, %r828;
	mul.wide.s32 	%rd311, %r648, 4;
	add.s64 	%rd119, %rd5, %rd311;
	@%p122 bra 	BB8_171;

	ld.local.f32 	%f472, [%rd119];
	abs.ftz.f32 	%f473, %f472;
	mul.ftz.f32 	%f474, %f473, 0f3089705F;
	setp.gtu.ftz.f32	%p123, %f78, %f474;
	@%p123 bra 	BB8_171;
	bra.uni 	BB8_170;

BB8_171:
	setp.eq.ftz.f32	%p124, %f77, 0f00000000;
	@%p124 bra 	BB8_186;

	ld.local.f32 	%f475, [%rd119];
	sub.ftz.f32 	%f80, %f79, %f475;
	abs.ftz.f32 	%f476, %f80;
	mul.ftz.f32 	%f477, %f476, 0f3089705F;
	setp.gt.ftz.f32	%p125, %f78, %f477;
	@%p125 bra 	BB8_174;
	bra.uni 	BB8_173;

BB8_174:
	mul.ftz.f32 	%f478, %f80, 0f3F000000;
	div.approx.ftz.f32 	%f479, %f478, %f77;
	abs.ftz.f32 	%f480, %f479;
	fma.rn.ftz.f32 	%f481, %f479, %f479, 0f3F800000;
	sqrt.approx.ftz.f32 	%f482, %f481;
	add.ftz.f32 	%f483, %f480, %f482;
	rcp.approx.ftz.f32 	%f484, %f483;
	setp.lt.ftz.f32	%p126, %f479, 0f00000000;
	neg.ftz.f32 	%f485, %f484;
	selp.f32	%f750, %f485, %f484, %p126;
	bra.uni 	BB8_175;

BB8_170:
	mov.u32 	%r649, 0;
	st.local.u32 	[%rd118], %r649;
	bra.uni 	BB8_186;

BB8_173:
	div.approx.ftz.f32 	%f750, %f77, %f80;

BB8_175:
	fma.rn.ftz.f32 	%f486, %f750, %f750, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f487, %f486;
	mul.ftz.f32 	%f84, %f750, %f487;
	add.ftz.f32 	%f488, %f487, 0f3F800000;
	div.approx.ftz.f32 	%f85, %f84, %f488;
	mul.ftz.f32 	%f489, %f77, %f750;
	add.ftz.f32 	%f490, %f489, %f79;
	st.local.f32 	[%rd116], %f490;
	ld.local.f32 	%f491, [%rd119];
	sub.ftz.f32 	%f492, %f491, %f489;
	st.local.f32 	[%rd119], %f492;
	mov.u32 	%r837, 0;
	st.local.u32 	[%rd118], %r837;
	setp.lt.s32	%p127, %r828, 1;
	@%p127 bra 	BB8_178;

	mul.lo.s32 	%r652, %r14, %r828;
	mul.wide.s32 	%rd312, %r652, 4;
	add.s64 	%rd440, %rd5, %rd312;
	mov.u32 	%r829, 0;
	mov.u64 	%rd441, %rd117;

BB8_177:
	ld.local.f32 	%f493, [%rd440];
	ld.local.f32 	%f494, [%rd441];
	fma.rn.ftz.f32 	%f495, %f85, %f493, %f494;
	mul.ftz.f32 	%f496, %f84, %f495;
	sub.ftz.f32 	%f497, %f493, %f496;
	st.local.f32 	[%rd440], %f497;
	mul.ftz.f32 	%f498, %f85, %f494;
	sub.ftz.f32 	%f499, %f493, %f498;
	ld.local.f32 	%f500, [%rd441];
	fma.rn.ftz.f32 	%f501, %f84, %f499, %f500;
	st.local.f32 	[%rd441], %f501;
	add.s64 	%rd441, %rd441, 4;
	add.s64 	%rd440, %rd440, 4;
	add.s32 	%r829, %r829, 1;
	setp.lt.s32	%p128, %r829, %r828;
	@%p128 bra 	BB8_177;

BB8_178:
	add.s32 	%r831, %r828, 1;
	setp.ge.s32	%p129, %r831, %r827;
	@%p129 bra 	BB8_181;

	add.s32 	%r653, %r232, %r828;
	mul.wide.s32 	%rd313, %r653, 4;
	add.s64 	%rd442, %rd47, %rd313;
	add.s32 	%r654, %r828, 1;
	mad.lo.s32 	%r830, %r14, %r654, %r828;

BB8_180:
	mul.wide.s32 	%rd314, %r830, 4;
	add.s64 	%rd315, %rd5, %rd314;
	ld.local.f32 	%f502, [%rd315];
	ld.local.f32 	%f503, [%rd442];
	fma.rn.ftz.f32 	%f504, %f85, %f502, %f503;
	mul.ftz.f32 	%f505, %f84, %f504;
	sub.ftz.f32 	%f506, %f502, %f505;
	st.local.f32 	[%rd315], %f506;
	mul.ftz.f32 	%f507, %f85, %f503;
	sub.ftz.f32 	%f508, %f502, %f507;
	ld.local.f32 	%f509, [%rd442];
	fma.rn.ftz.f32 	%f510, %f84, %f508, %f509;
	st.local.f32 	[%rd442], %f510;
	add.s64 	%rd442, %rd442, 4;
	add.s32 	%r830, %r830, %r14;
	add.s32 	%r831, %r831, 1;
	setp.lt.s32	%p130, %r831, %r827;
	@%p130 bra 	BB8_180;

BB8_181:
	setp.ge.s32	%p131, %r646, %r14;
	@%p131 bra 	BB8_184;

	add.s32 	%r832, %r233, %r828;
	mov.u32 	%r833, %r234;
	mov.u32 	%r834, %r646;

BB8_183:
	mul.wide.s32 	%rd316, %r832, 4;
	add.s64 	%rd317, %rd5, %rd316;
	mul.wide.s32 	%rd318, %r833, 4;
	add.s64 	%rd319, %rd5, %rd318;
	ld.local.f32 	%f511, [%rd317];
	ld.local.f32 	%f512, [%rd319];
	fma.rn.ftz.f32 	%f513, %f85, %f511, %f512;
	mul.ftz.f32 	%f514, %f84, %f513;
	sub.ftz.f32 	%f515, %f511, %f514;
	st.local.f32 	[%rd317], %f515;
	mul.ftz.f32 	%f516, %f85, %f512;
	sub.ftz.f32 	%f517, %f511, %f516;
	ld.local.f32 	%f518, [%rd319];
	fma.rn.ftz.f32 	%f519, %f84, %f517, %f518;
	st.local.f32 	[%rd319], %f519;
	add.s32 	%r833, %r833, %r14;
	add.s32 	%r832, %r832, %r14;
	add.s32 	%r834, %r834, 1;
	setp.lt.s32	%p132, %r834, %r14;
	@%p132 bra 	BB8_183;

BB8_184:
	mul.lo.s32 	%r835, %r237, %r9;
	mov.u32 	%r836, %r235;

BB8_185:
	add.s32 	%r657, %r835, %r8;
	mul.wide.s32 	%rd320, %r657, 4;
	add.s64 	%rd321, %rd1, %rd320;
	add.s32 	%r658, %r836, %r8;
	mul.wide.s32 	%rd322, %r658, 4;
	add.s64 	%rd323, %rd1, %rd322;
	ld.global.f32 	%f520, [%rd321];
	ld.global.f32 	%f521, [%rd323];
	fma.rn.ftz.f32 	%f522, %f85, %f520, %f521;
	mul.ftz.f32 	%f523, %f84, %f522;
	sub.ftz.f32 	%f524, %f520, %f523;
	st.global.f32 	[%rd321], %f524;
	mul.ftz.f32 	%f525, %f85, %f521;
	sub.ftz.f32 	%f526, %f520, %f525;
	ld.global.f32 	%f527, [%rd323];
	fma.rn.ftz.f32 	%f528, %f84, %f526, %f527;
	st.global.f32 	[%rd323], %f528;
	add.s32 	%r836, %r836, %r9;
	add.s32 	%r835, %r835, %r9;
	add.s32 	%r837, %r837, 1;
	setp.lt.s32	%p133, %r837, %r14;
	@%p133 bra 	BB8_185;

BB8_186:
	add.s32 	%r828, %r828, 1;
	setp.lt.s32	%p134, %r828, %r827;
	@%p134 bra 	BB8_168;

BB8_187:
	add.s32 	%r827, %r827, 1;
	mov.u32 	%r838, 1;
	setp.lt.s32	%p135, %r827, %r14;
	mov.f32 	%f753, 0f00000000;
	@%p135 bra 	BB8_166;

BB8_188:
	setp.lt.s32	%p136, %r838, 1;
	@%p136 bra 	BB8_191;

	mul.lo.s32 	%r661, %r14, %r838;
	mul.wide.s32 	%rd324, %r661, 4;
	add.s64 	%rd443, %rd5, %rd324;
	mov.u32 	%r839, 0;

BB8_190:
	ld.local.f32 	%f530, [%rd443];
	abs.ftz.f32 	%f531, %f530;
	add.ftz.f32 	%f753, %f753, %f531;
	add.s64 	%rd443, %rd443, 4;
	add.s32 	%r839, %r839, 1;
	setp.lt.s32	%p137, %r839, %r838;
	@%p137 bra 	BB8_190;

BB8_191:
	add.s32 	%r838, %r838, 1;
	setp.lt.s32	%p138, %r838, %r14;
	@%p138 bra 	BB8_188;

	setp.lt.ftz.f32	%p139, %f753, 0f33D6BF95;
	@%p139 bra 	BB8_272;

	mov.u32 	%r840, 1;

BB8_194:
	setp.lt.s32	%p140, %r840, 1;
	@%p140 bra 	BB8_215;

	mul.lo.s32 	%r269, %r840, %r14;
	add.s32 	%r664, %r269, %r840;
	mul.wide.s32 	%rd325, %r664, 4;
	add.s64 	%rd133, %rd5, %rd325;
	mul.wide.s32 	%rd326, %r269, 4;
	add.s64 	%rd134, %rd5, %rd326;
	add.s32 	%r665, %r840, 1;
	mul.lo.s32 	%r270, %r14, %r665;
	add.s32 	%r271, %r840, %r270;
	mul.lo.s32 	%r272, %r269, %r9;
	mov.u32 	%r841, 0;

BB8_196:
	add.s32 	%r666, %r841, %r269;
	mul.wide.s32 	%rd327, %r666, 4;
	add.s64 	%rd135, %rd5, %rd327;
	ld.local.f32 	%f90, [%rd135];
	abs.ftz.f32 	%f91, %f90;
	ld.local.f32 	%f92, [%rd133];
	abs.ftz.f32 	%f532, %f92;
	mul.ftz.f32 	%f533, %f532, 0f3089705F;
	setp.gtu.ftz.f32	%p141, %f91, %f533;
	mul.lo.s32 	%r274, %r841, %r14;
	add.s32 	%r667, %r274, %r841;
	mul.wide.s32 	%rd328, %r667, 4;
	add.s64 	%rd136, %rd5, %rd328;
	@%p141 bra 	BB8_199;

	ld.local.f32 	%f534, [%rd136];
	abs.ftz.f32 	%f535, %f534;
	mul.ftz.f32 	%f536, %f535, 0f3089705F;
	setp.gtu.ftz.f32	%p142, %f91, %f536;
	@%p142 bra 	BB8_199;
	bra.uni 	BB8_198;

BB8_199:
	setp.eq.ftz.f32	%p143, %f90, 0f00000000;
	@%p143 bra 	BB8_214;

	ld.local.f32 	%f537, [%rd136];
	sub.ftz.f32 	%f93, %f92, %f537;
	abs.ftz.f32 	%f538, %f93;
	mul.ftz.f32 	%f539, %f538, 0f3089705F;
	setp.gt.ftz.f32	%p144, %f91, %f539;
	@%p144 bra 	BB8_202;
	bra.uni 	BB8_201;

BB8_202:
	mul.ftz.f32 	%f540, %f93, 0f3F000000;
	div.approx.ftz.f32 	%f541, %f540, %f90;
	abs.ftz.f32 	%f542, %f541;
	fma.rn.ftz.f32 	%f543, %f541, %f541, 0f3F800000;
	sqrt.approx.ftz.f32 	%f544, %f543;
	add.ftz.f32 	%f545, %f542, %f544;
	rcp.approx.ftz.f32 	%f546, %f545;
	setp.lt.ftz.f32	%p145, %f541, 0f00000000;
	neg.ftz.f32 	%f547, %f546;
	selp.f32	%f754, %f547, %f546, %p145;
	bra.uni 	BB8_203;

BB8_198:
	mov.u32 	%r668, 0;
	st.local.u32 	[%rd135], %r668;
	bra.uni 	BB8_214;

BB8_201:
	div.approx.ftz.f32 	%f754, %f90, %f93;

BB8_203:
	fma.rn.ftz.f32 	%f548, %f754, %f754, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f549, %f548;
	mul.ftz.f32 	%f97, %f754, %f549;
	add.ftz.f32 	%f550, %f549, 0f3F800000;
	div.approx.ftz.f32 	%f98, %f97, %f550;
	mul.ftz.f32 	%f551, %f90, %f754;
	add.ftz.f32 	%f552, %f551, %f92;
	st.local.f32 	[%rd133], %f552;
	ld.local.f32 	%f553, [%rd136];
	sub.ftz.f32 	%f554, %f553, %f551;
	st.local.f32 	[%rd136], %f554;
	mov.u32 	%r850, 0;
	st.local.u32 	[%rd135], %r850;
	setp.lt.s32	%p146, %r841, 1;
	@%p146 bra 	BB8_206;

	mul.lo.s32 	%r671, %r14, %r841;
	mul.wide.s32 	%rd329, %r671, 4;
	add.s64 	%rd444, %rd5, %rd329;
	mov.u32 	%r842, 0;
	mov.u64 	%rd445, %rd134;

BB8_205:
	ld.local.f32 	%f555, [%rd444];
	ld.local.f32 	%f556, [%rd445];
	fma.rn.ftz.f32 	%f557, %f98, %f555, %f556;
	mul.ftz.f32 	%f558, %f97, %f557;
	sub.ftz.f32 	%f559, %f555, %f558;
	st.local.f32 	[%rd444], %f559;
	mul.ftz.f32 	%f560, %f98, %f556;
	sub.ftz.f32 	%f561, %f555, %f560;
	ld.local.f32 	%f562, [%rd445];
	fma.rn.ftz.f32 	%f563, %f97, %f561, %f562;
	st.local.f32 	[%rd445], %f563;
	add.s64 	%rd445, %rd445, 4;
	add.s64 	%rd444, %rd444, 4;
	add.s32 	%r842, %r842, 1;
	setp.lt.s32	%p147, %r842, %r841;
	@%p147 bra 	BB8_205;

BB8_206:
	add.s32 	%r844, %r841, 1;
	setp.ge.s32	%p148, %r844, %r840;
	@%p148 bra 	BB8_209;

	add.s32 	%r672, %r269, %r841;
	mul.wide.s32 	%rd330, %r672, 4;
	add.s64 	%rd446, %rd47, %rd330;
	add.s32 	%r673, %r841, 1;
	mad.lo.s32 	%r843, %r14, %r673, %r841;

BB8_208:
	mul.wide.s32 	%rd331, %r843, 4;
	add.s64 	%rd332, %rd5, %rd331;
	ld.local.f32 	%f564, [%rd332];
	ld.local.f32 	%f565, [%rd446];
	fma.rn.ftz.f32 	%f566, %f98, %f564, %f565;
	mul.ftz.f32 	%f567, %f97, %f566;
	sub.ftz.f32 	%f568, %f564, %f567;
	st.local.f32 	[%rd332], %f568;
	mul.ftz.f32 	%f569, %f98, %f565;
	sub.ftz.f32 	%f570, %f564, %f569;
	ld.local.f32 	%f571, [%rd446];
	fma.rn.ftz.f32 	%f572, %f97, %f570, %f571;
	st.local.f32 	[%rd446], %f572;
	add.s64 	%rd446, %rd446, 4;
	add.s32 	%r843, %r843, %r14;
	add.s32 	%r844, %r844, 1;
	setp.lt.s32	%p149, %r844, %r840;
	@%p149 bra 	BB8_208;

BB8_209:
	setp.ge.s32	%p150, %r665, %r14;
	@%p150 bra 	BB8_212;

	add.s32 	%r845, %r270, %r841;
	mov.u32 	%r846, %r271;
	mov.u32 	%r847, %r665;

BB8_211:
	mul.wide.s32 	%rd333, %r845, 4;
	add.s64 	%rd334, %rd5, %rd333;
	mul.wide.s32 	%rd335, %r846, 4;
	add.s64 	%rd336, %rd5, %rd335;
	ld.local.f32 	%f573, [%rd334];
	ld.local.f32 	%f574, [%rd336];
	fma.rn.ftz.f32 	%f575, %f98, %f573, %f574;
	mul.ftz.f32 	%f576, %f97, %f575;
	sub.ftz.f32 	%f577, %f573, %f576;
	st.local.f32 	[%rd334], %f577;
	mul.ftz.f32 	%f578, %f98, %f574;
	sub.ftz.f32 	%f579, %f573, %f578;
	ld.local.f32 	%f580, [%rd336];
	fma.rn.ftz.f32 	%f581, %f97, %f579, %f580;
	st.local.f32 	[%rd336], %f581;
	add.s32 	%r846, %r846, %r14;
	add.s32 	%r845, %r845, %r14;
	add.s32 	%r847, %r847, 1;
	setp.lt.s32	%p151, %r847, %r14;
	@%p151 bra 	BB8_211;

BB8_212:
	mul.lo.s32 	%r848, %r274, %r9;
	mov.u32 	%r849, %r272;

BB8_213:
	add.s32 	%r676, %r848, %r8;
	mul.wide.s32 	%rd337, %r676, 4;
	add.s64 	%rd338, %rd1, %rd337;
	add.s32 	%r677, %r849, %r8;
	mul.wide.s32 	%rd339, %r677, 4;
	add.s64 	%rd340, %rd1, %rd339;
	ld.global.f32 	%f582, [%rd338];
	ld.global.f32 	%f583, [%rd340];
	fma.rn.ftz.f32 	%f584, %f98, %f582, %f583;
	mul.ftz.f32 	%f585, %f97, %f584;
	sub.ftz.f32 	%f586, %f582, %f585;
	st.global.f32 	[%rd338], %f586;
	mul.ftz.f32 	%f587, %f98, %f583;
	sub.ftz.f32 	%f588, %f582, %f587;
	ld.global.f32 	%f589, [%rd340];
	fma.rn.ftz.f32 	%f590, %f97, %f588, %f589;
	st.global.f32 	[%rd340], %f590;
	add.s32 	%r849, %r849, %r9;
	add.s32 	%r848, %r848, %r9;
	add.s32 	%r850, %r850, 1;
	setp.lt.s32	%p152, %r850, %r14;
	@%p152 bra 	BB8_213;

BB8_214:
	add.s32 	%r841, %r841, 1;
	setp.lt.s32	%p153, %r841, %r840;
	@%p153 bra 	BB8_196;

BB8_215:
	add.s32 	%r840, %r840, 1;
	mov.u32 	%r851, 1;
	setp.lt.s32	%p154, %r840, %r14;
	mov.f32 	%f757, 0f00000000;
	@%p154 bra 	BB8_194;

BB8_216:
	setp.lt.s32	%p155, %r851, 1;
	@%p155 bra 	BB8_219;

	mul.lo.s32 	%r680, %r14, %r851;
	mul.wide.s32 	%rd341, %r680, 4;
	add.s64 	%rd447, %rd5, %rd341;
	mov.u32 	%r852, 0;

BB8_218:
	ld.local.f32 	%f592, [%rd447];
	abs.ftz.f32 	%f593, %f592;
	add.ftz.f32 	%f757, %f757, %f593;
	add.s64 	%rd447, %rd447, 4;
	add.s32 	%r852, %r852, 1;
	setp.lt.s32	%p156, %r852, %r851;
	@%p156 bra 	BB8_218;

BB8_219:
	add.s32 	%r851, %r851, 1;
	setp.lt.s32	%p157, %r851, %r14;
	@%p157 bra 	BB8_216;

	setp.lt.ftz.f32	%p158, %f757, 0f33D6BF95;
	@%p158 bra 	BB8_272;

	mov.u32 	%r853, 1;

BB8_222:
	setp.lt.s32	%p159, %r853, 1;
	@%p159 bra 	BB8_243;

	mul.lo.s32 	%r307, %r853, %r14;
	add.s32 	%r683, %r307, %r853;
	mul.wide.s32 	%rd342, %r683, 4;
	add.s64 	%rd150, %rd5, %rd342;
	mul.wide.s32 	%rd343, %r307, 4;
	add.s64 	%rd151, %rd5, %rd343;
	add.s32 	%r684, %r853, 1;
	mul.lo.s32 	%r308, %r14, %r684;
	add.s32 	%r309, %r853, %r308;
	mul.lo.s32 	%r310, %r307, %r9;
	mov.u32 	%r854, 0;

BB8_224:
	add.s32 	%r685, %r854, %r307;
	mul.wide.s32 	%rd344, %r685, 4;
	add.s64 	%rd152, %rd5, %rd344;
	ld.local.f32 	%f103, [%rd152];
	abs.ftz.f32 	%f104, %f103;
	ld.local.f32 	%f105, [%rd150];
	abs.ftz.f32 	%f594, %f105;
	mul.ftz.f32 	%f595, %f594, 0f3089705F;
	setp.gtu.ftz.f32	%p160, %f104, %f595;
	mul.lo.s32 	%r312, %r854, %r14;
	add.s32 	%r686, %r312, %r854;
	mul.wide.s32 	%rd345, %r686, 4;
	add.s64 	%rd153, %rd5, %rd345;
	@%p160 bra 	BB8_227;

	ld.local.f32 	%f596, [%rd153];
	abs.ftz.f32 	%f597, %f596;
	mul.ftz.f32 	%f598, %f597, 0f3089705F;
	setp.gtu.ftz.f32	%p161, %f104, %f598;
	@%p161 bra 	BB8_227;
	bra.uni 	BB8_226;

BB8_227:
	setp.eq.ftz.f32	%p162, %f103, 0f00000000;
	@%p162 bra 	BB8_242;

	ld.local.f32 	%f599, [%rd153];
	sub.ftz.f32 	%f106, %f105, %f599;
	abs.ftz.f32 	%f600, %f106;
	mul.ftz.f32 	%f601, %f600, 0f3089705F;
	setp.gt.ftz.f32	%p163, %f104, %f601;
	@%p163 bra 	BB8_230;
	bra.uni 	BB8_229;

BB8_230:
	mul.ftz.f32 	%f602, %f106, 0f3F000000;
	div.approx.ftz.f32 	%f603, %f602, %f103;
	abs.ftz.f32 	%f604, %f603;
	fma.rn.ftz.f32 	%f605, %f603, %f603, 0f3F800000;
	sqrt.approx.ftz.f32 	%f606, %f605;
	add.ftz.f32 	%f607, %f604, %f606;
	rcp.approx.ftz.f32 	%f608, %f607;
	setp.lt.ftz.f32	%p164, %f603, 0f00000000;
	neg.ftz.f32 	%f609, %f608;
	selp.f32	%f758, %f609, %f608, %p164;
	bra.uni 	BB8_231;

BB8_226:
	mov.u32 	%r687, 0;
	st.local.u32 	[%rd152], %r687;
	bra.uni 	BB8_242;

BB8_229:
	div.approx.ftz.f32 	%f758, %f103, %f106;

BB8_231:
	fma.rn.ftz.f32 	%f610, %f758, %f758, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f611, %f610;
	mul.ftz.f32 	%f110, %f758, %f611;
	add.ftz.f32 	%f612, %f611, 0f3F800000;
	div.approx.ftz.f32 	%f111, %f110, %f612;
	mul.ftz.f32 	%f613, %f103, %f758;
	add.ftz.f32 	%f614, %f613, %f105;
	st.local.f32 	[%rd150], %f614;
	ld.local.f32 	%f615, [%rd153];
	sub.ftz.f32 	%f616, %f615, %f613;
	st.local.f32 	[%rd153], %f616;
	mov.u32 	%r863, 0;
	st.local.u32 	[%rd152], %r863;
	setp.lt.s32	%p165, %r854, 1;
	@%p165 bra 	BB8_234;

	mul.lo.s32 	%r690, %r14, %r854;
	mul.wide.s32 	%rd346, %r690, 4;
	add.s64 	%rd448, %rd5, %rd346;
	mov.u32 	%r855, 0;
	mov.u64 	%rd449, %rd151;

BB8_233:
	ld.local.f32 	%f617, [%rd448];
	ld.local.f32 	%f618, [%rd449];
	fma.rn.ftz.f32 	%f619, %f111, %f617, %f618;
	mul.ftz.f32 	%f620, %f110, %f619;
	sub.ftz.f32 	%f621, %f617, %f620;
	st.local.f32 	[%rd448], %f621;
	mul.ftz.f32 	%f622, %f111, %f618;
	sub.ftz.f32 	%f623, %f617, %f622;
	ld.local.f32 	%f624, [%rd449];
	fma.rn.ftz.f32 	%f625, %f110, %f623, %f624;
	st.local.f32 	[%rd449], %f625;
	add.s64 	%rd449, %rd449, 4;
	add.s64 	%rd448, %rd448, 4;
	add.s32 	%r855, %r855, 1;
	setp.lt.s32	%p166, %r855, %r854;
	@%p166 bra 	BB8_233;

BB8_234:
	add.s32 	%r857, %r854, 1;
	setp.ge.s32	%p167, %r857, %r853;
	@%p167 bra 	BB8_237;

	add.s32 	%r691, %r307, %r854;
	mul.wide.s32 	%rd347, %r691, 4;
	add.s64 	%rd450, %rd47, %rd347;
	add.s32 	%r692, %r854, 1;
	mad.lo.s32 	%r856, %r14, %r692, %r854;

BB8_236:
	mul.wide.s32 	%rd348, %r856, 4;
	add.s64 	%rd349, %rd5, %rd348;
	ld.local.f32 	%f626, [%rd349];
	ld.local.f32 	%f627, [%rd450];
	fma.rn.ftz.f32 	%f628, %f111, %f626, %f627;
	mul.ftz.f32 	%f629, %f110, %f628;
	sub.ftz.f32 	%f630, %f626, %f629;
	st.local.f32 	[%rd349], %f630;
	mul.ftz.f32 	%f631, %f111, %f627;
	sub.ftz.f32 	%f632, %f626, %f631;
	ld.local.f32 	%f633, [%rd450];
	fma.rn.ftz.f32 	%f634, %f110, %f632, %f633;
	st.local.f32 	[%rd450], %f634;
	add.s64 	%rd450, %rd450, 4;
	add.s32 	%r856, %r856, %r14;
	add.s32 	%r857, %r857, 1;
	setp.lt.s32	%p168, %r857, %r853;
	@%p168 bra 	BB8_236;

BB8_237:
	setp.ge.s32	%p169, %r684, %r14;
	@%p169 bra 	BB8_240;

	add.s32 	%r858, %r308, %r854;
	mov.u32 	%r859, %r309;
	mov.u32 	%r860, %r684;

BB8_239:
	mul.wide.s32 	%rd350, %r858, 4;
	add.s64 	%rd351, %rd5, %rd350;
	mul.wide.s32 	%rd352, %r859, 4;
	add.s64 	%rd353, %rd5, %rd352;
	ld.local.f32 	%f635, [%rd351];
	ld.local.f32 	%f636, [%rd353];
	fma.rn.ftz.f32 	%f637, %f111, %f635, %f636;
	mul.ftz.f32 	%f638, %f110, %f637;
	sub.ftz.f32 	%f639, %f635, %f638;
	st.local.f32 	[%rd351], %f639;
	mul.ftz.f32 	%f640, %f111, %f636;
	sub.ftz.f32 	%f641, %f635, %f640;
	ld.local.f32 	%f642, [%rd353];
	fma.rn.ftz.f32 	%f643, %f110, %f641, %f642;
	st.local.f32 	[%rd353], %f643;
	add.s32 	%r859, %r859, %r14;
	add.s32 	%r858, %r858, %r14;
	add.s32 	%r860, %r860, 1;
	setp.lt.s32	%p170, %r860, %r14;
	@%p170 bra 	BB8_239;

BB8_240:
	mul.lo.s32 	%r861, %r312, %r9;
	mov.u32 	%r862, %r310;

BB8_241:
	add.s32 	%r695, %r861, %r8;
	mul.wide.s32 	%rd354, %r695, 4;
	add.s64 	%rd355, %rd1, %rd354;
	add.s32 	%r696, %r862, %r8;
	mul.wide.s32 	%rd356, %r696, 4;
	add.s64 	%rd357, %rd1, %rd356;
	ld.global.f32 	%f644, [%rd355];
	ld.global.f32 	%f645, [%rd357];
	fma.rn.ftz.f32 	%f646, %f111, %f644, %f645;
	mul.ftz.f32 	%f647, %f110, %f646;
	sub.ftz.f32 	%f648, %f644, %f647;
	st.global.f32 	[%rd355], %f648;
	mul.ftz.f32 	%f649, %f111, %f645;
	sub.ftz.f32 	%f650, %f644, %f649;
	ld.global.f32 	%f651, [%rd357];
	fma.rn.ftz.f32 	%f652, %f110, %f650, %f651;
	st.global.f32 	[%rd357], %f652;
	add.s32 	%r862, %r862, %r9;
	add.s32 	%r861, %r861, %r9;
	add.s32 	%r863, %r863, 1;
	setp.lt.s32	%p171, %r863, %r14;
	@%p171 bra 	BB8_241;

BB8_242:
	add.s32 	%r854, %r854, 1;
	setp.lt.s32	%p172, %r854, %r853;
	@%p172 bra 	BB8_224;

BB8_243:
	add.s32 	%r853, %r853, 1;
	mov.u32 	%r864, 1;
	setp.lt.s32	%p173, %r853, %r14;
	mov.f32 	%f761, 0f00000000;
	@%p173 bra 	BB8_222;

BB8_244:
	setp.lt.s32	%p174, %r864, 1;
	@%p174 bra 	BB8_247;

	mul.lo.s32 	%r699, %r14, %r864;
	mul.wide.s32 	%rd358, %r699, 4;
	add.s64 	%rd451, %rd5, %rd358;
	mov.u32 	%r865, 0;

BB8_246:
	ld.local.f32 	%f654, [%rd451];
	abs.ftz.f32 	%f655, %f654;
	add.ftz.f32 	%f761, %f761, %f655;
	add.s64 	%rd451, %rd451, 4;
	add.s32 	%r865, %r865, 1;
	setp.lt.s32	%p175, %r865, %r864;
	@%p175 bra 	BB8_246;

BB8_247:
	add.s32 	%r864, %r864, 1;
	setp.lt.s32	%p176, %r864, %r14;
	@%p176 bra 	BB8_244;

	setp.lt.ftz.f32	%p177, %f761, 0f33D6BF95;
	@%p177 bra 	BB8_272;

	mov.u32 	%r866, 1;

BB8_250:
	add.s32 	%r345, %r866, 1;
	setp.lt.s32	%p178, %r866, 1;
	@%p178 bra 	BB8_271;

	mul.lo.s32 	%r346, %r866, %r14;
	add.s32 	%r702, %r346, %r866;
	mul.wide.s32 	%rd359, %r702, 4;
	add.s64 	%rd167, %rd5, %rd359;
	mul.wide.s32 	%rd360, %r346, 4;
	add.s64 	%rd168, %rd5, %rd360;
	add.s32 	%r703, %r866, 1;
	mul.lo.s32 	%r347, %r14, %r703;
	add.s32 	%r348, %r866, %r347;
	mul.lo.s32 	%r349, %r346, %r9;
	mov.u32 	%r867, 0;

BB8_252:
	add.s32 	%r704, %r867, %r346;
	mul.wide.s32 	%rd361, %r704, 4;
	add.s64 	%rd169, %rd5, %rd361;
	ld.local.f32 	%f116, [%rd169];
	abs.ftz.f32 	%f117, %f116;
	ld.local.f32 	%f118, [%rd167];
	abs.ftz.f32 	%f656, %f118;
	mul.ftz.f32 	%f657, %f656, 0f3089705F;
	setp.gtu.ftz.f32	%p179, %f117, %f657;
	mul.lo.s32 	%r351, %r867, %r14;
	add.s32 	%r705, %r351, %r867;
	mul.wide.s32 	%rd362, %r705, 4;
	add.s64 	%rd170, %rd5, %rd362;
	@%p179 bra 	BB8_255;

	ld.local.f32 	%f658, [%rd170];
	abs.ftz.f32 	%f659, %f658;
	mul.ftz.f32 	%f660, %f659, 0f3089705F;
	setp.gtu.ftz.f32	%p180, %f117, %f660;
	@%p180 bra 	BB8_255;
	bra.uni 	BB8_254;

BB8_255:
	setp.eq.ftz.f32	%p181, %f116, 0f00000000;
	@%p181 bra 	BB8_270;

	ld.local.f32 	%f661, [%rd170];
	sub.ftz.f32 	%f119, %f118, %f661;
	abs.ftz.f32 	%f662, %f119;
	mul.ftz.f32 	%f663, %f662, 0f3089705F;
	setp.gt.ftz.f32	%p182, %f117, %f663;
	@%p182 bra 	BB8_258;
	bra.uni 	BB8_257;

BB8_258:
	mul.ftz.f32 	%f664, %f119, 0f3F000000;
	div.approx.ftz.f32 	%f665, %f664, %f116;
	abs.ftz.f32 	%f666, %f665;
	fma.rn.ftz.f32 	%f667, %f665, %f665, 0f3F800000;
	sqrt.approx.ftz.f32 	%f668, %f667;
	add.ftz.f32 	%f669, %f666, %f668;
	rcp.approx.ftz.f32 	%f670, %f669;
	setp.lt.ftz.f32	%p183, %f665, 0f00000000;
	neg.ftz.f32 	%f671, %f670;
	selp.f32	%f762, %f671, %f670, %p183;
	bra.uni 	BB8_259;

BB8_254:
	mov.u32 	%r706, 0;
	st.local.u32 	[%rd169], %r706;
	bra.uni 	BB8_270;

BB8_257:
	div.approx.ftz.f32 	%f762, %f116, %f119;

BB8_259:
	fma.rn.ftz.f32 	%f672, %f762, %f762, 0f3F800000;
	rsqrt.approx.ftz.f32 	%f673, %f672;
	mul.ftz.f32 	%f123, %f762, %f673;
	add.ftz.f32 	%f674, %f673, 0f3F800000;
	div.approx.ftz.f32 	%f124, %f123, %f674;
	mul.ftz.f32 	%f675, %f116, %f762;
	add.ftz.f32 	%f676, %f675, %f118;
	st.local.f32 	[%rd167], %f676;
	ld.local.f32 	%f677, [%rd170];
	sub.ftz.f32 	%f678, %f677, %f675;
	st.local.f32 	[%rd170], %f678;
	mov.u32 	%r876, 0;
	st.local.u32 	[%rd169], %r876;
	setp.lt.s32	%p184, %r867, 1;
	@%p184 bra 	BB8_262;

	mul.lo.s32 	%r709, %r14, %r867;
	mul.wide.s32 	%rd363, %r709, 4;
	add.s64 	%rd452, %rd5, %rd363;
	mov.u32 	%r868, 0;
	mov.u64 	%rd453, %rd168;

BB8_261:
	ld.local.f32 	%f679, [%rd452];
	ld.local.f32 	%f680, [%rd453];
	fma.rn.ftz.f32 	%f681, %f124, %f679, %f680;
	mul.ftz.f32 	%f682, %f123, %f681;
	sub.ftz.f32 	%f683, %f679, %f682;
	st.local.f32 	[%rd452], %f683;
	mul.ftz.f32 	%f684, %f124, %f680;
	sub.ftz.f32 	%f685, %f679, %f684;
	ld.local.f32 	%f686, [%rd453];
	fma.rn.ftz.f32 	%f687, %f123, %f685, %f686;
	st.local.f32 	[%rd453], %f687;
	add.s64 	%rd453, %rd453, 4;
	add.s64 	%rd452, %rd452, 4;
	add.s32 	%r868, %r868, 1;
	setp.lt.s32	%p185, %r868, %r867;
	@%p185 bra 	BB8_261;

BB8_262:
	add.s32 	%r870, %r867, 1;
	setp.ge.s32	%p186, %r870, %r866;
	@%p186 bra 	BB8_265;

	add.s32 	%r710, %r346, %r867;
	mul.wide.s32 	%rd364, %r710, 4;
	add.s64 	%rd454, %rd47, %rd364;
	add.s32 	%r711, %r867, 1;
	mad.lo.s32 	%r869, %r14, %r711, %r867;

BB8_264:
	mul.wide.s32 	%rd365, %r869, 4;
	add.s64 	%rd366, %rd5, %rd365;
	ld.local.f32 	%f688, [%rd366];
	ld.local.f32 	%f689, [%rd454];
	fma.rn.ftz.f32 	%f690, %f124, %f688, %f689;
	mul.ftz.f32 	%f691, %f123, %f690;
	sub.ftz.f32 	%f692, %f688, %f691;
	st.local.f32 	[%rd366], %f692;
	mul.ftz.f32 	%f693, %f124, %f689;
	sub.ftz.f32 	%f694, %f688, %f693;
	ld.local.f32 	%f695, [%rd454];
	fma.rn.ftz.f32 	%f696, %f123, %f694, %f695;
	st.local.f32 	[%rd454], %f696;
	add.s64 	%rd454, %rd454, 4;
	add.s32 	%r869, %r869, %r14;
	add.s32 	%r870, %r870, 1;
	setp.lt.s32	%p187, %r870, %r866;
	@%p187 bra 	BB8_264;

BB8_265:
	setp.ge.s32	%p188, %r345, %r14;
	@%p188 bra 	BB8_268;

	add.s32 	%r871, %r347, %r867;
	mov.u32 	%r872, %r348;
	mov.u32 	%r873, %r345;

BB8_267:
	mul.wide.s32 	%rd367, %r871, 4;
	add.s64 	%rd368, %rd5, %rd367;
	mul.wide.s32 	%rd369, %r872, 4;
	add.s64 	%rd370, %rd5, %rd369;
	ld.local.f32 	%f697, [%rd368];
	ld.local.f32 	%f698, [%rd370];
	fma.rn.ftz.f32 	%f699, %f124, %f697, %f698;
	mul.ftz.f32 	%f700, %f123, %f699;
	sub.ftz.f32 	%f701, %f697, %f700;
	st.local.f32 	[%rd368], %f701;
	mul.ftz.f32 	%f702, %f124, %f698;
	sub.ftz.f32 	%f703, %f697, %f702;
	ld.local.f32 	%f704, [%rd370];
	fma.rn.ftz.f32 	%f705, %f123, %f703, %f704;
	st.local.f32 	[%rd370], %f705;
	add.s32 	%r872, %r872, %r14;
	add.s32 	%r871, %r871, %r14;
	add.s32 	%r873, %r873, 1;
	setp.lt.s32	%p189, %r873, %r14;
	@%p189 bra 	BB8_267;

BB8_268:
	mul.lo.s32 	%r874, %r351, %r9;
	mov.u32 	%r875, %r349;

BB8_269:
	add.s32 	%r713, %r874, %r8;
	mul.wide.s32 	%rd371, %r713, 4;
	add.s64 	%rd372, %rd1, %rd371;
	add.s32 	%r714, %r875, %r8;
	mul.wide.s32 	%rd373, %r714, 4;
	add.s64 	%rd374, %rd1, %rd373;
	ld.global.f32 	%f706, [%rd372];
	ld.global.f32 	%f707, [%rd374];
	fma.rn.ftz.f32 	%f708, %f124, %f706, %f707;
	mul.ftz.f32 	%f709, %f123, %f708;
	sub.ftz.f32 	%f710, %f706, %f709;
	st.global.f32 	[%rd372], %f710;
	mul.ftz.f32 	%f711, %f124, %f707;
	sub.ftz.f32 	%f712, %f706, %f711;
	ld.global.f32 	%f713, [%rd374];
	fma.rn.ftz.f32 	%f714, %f123, %f712, %f713;
	st.global.f32 	[%rd374], %f714;
	add.s32 	%r875, %r875, %r9;
	add.s32 	%r874, %r874, %r9;
	add.s32 	%r876, %r876, 1;
	setp.lt.s32	%p190, %r876, %r14;
	@%p190 bra 	BB8_269;

BB8_270:
	add.s32 	%r867, %r867, 1;
	setp.lt.s32	%p191, %r867, %r866;
	@%p191 bra 	BB8_252;

BB8_271:
	add.s32 	%r866, %r866, 1;
	setp.lt.s32	%p192, %r866, %r14;
	@%p192 bra 	BB8_250;

BB8_272:
	add.s32 	%r376, %r14, -1;
	setp.lt.s32	%p193, %r376, 1;
	@%p193 bra 	BB8_281;

	add.s32 	%r377, %r14, 1;
	mov.u32 	%r877, 0;

BB8_274:
	mul.lo.s32 	%r381, %r877, %r14;
	add.s32 	%r716, %r381, %r877;
	mul.wide.s32 	%rd375, %r716, 4;
	add.s64 	%rd180, %rd5, %rd375;
	ld.local.f32 	%f125, [%rd180];
	setp.ge.s32	%p194, %r877, %r14;
	mov.u32 	%r881, %r877;
	mov.f32 	%f764, %f125;
	@%p194 bra 	BB8_277;

	mul.lo.s32 	%r878, %r377, %r877;
	mov.u32 	%r879, %r877;
	mov.u32 	%r881, %r877;
	mov.f32 	%f764, %f125;

BB8_276:
	mul.wide.s32 	%rd376, %r878, 4;
	add.s64 	%rd377, %rd5, %rd376;
	ld.local.f32 	%f715, [%rd377];
	setp.ltu.ftz.f32	%p195, %f715, %f764;
	selp.f32	%f764, %f764, %f715, %p195;
	selp.b32	%r881, %r881, %r879, %p195;
	add.s32 	%r878, %r878, %r377;
	add.s32 	%r879, %r879, 1;
	setp.lt.s32	%p196, %r879, %r14;
	@%p196 bra 	BB8_276;

BB8_277:
	setp.eq.s32	%p197, %r881, %r877;
	@%p197 bra 	BB8_280;

	mul.lo.s32 	%r718, %r881, %r14;
	add.s32 	%r719, %r718, %r881;
	mul.wide.s32 	%rd378, %r719, 4;
	add.s64 	%rd379, %rd5, %rd378;
	st.local.f32 	[%rd379], %f125;
	st.local.f32 	[%rd180], %f764;
	mul.lo.s32 	%r883, %r381, %r9;
	mul.lo.s32 	%r882, %r718, %r9;
	mov.u32 	%r884, 0;

BB8_279:
	add.s32 	%r720, %r883, %r8;
	mul.wide.s32 	%rd380, %r720, 4;
	add.s64 	%rd381, %rd1, %rd380;
	ld.global.f32 	%f716, [%rd381];
	add.s32 	%r721, %r882, %r8;
	mul.wide.s32 	%rd382, %r721, 4;
	add.s64 	%rd383, %rd1, %rd382;
	ld.global.f32 	%f717, [%rd383];
	st.global.f32 	[%rd381], %f717;
	st.global.f32 	[%rd383], %f716;
	add.s32 	%r883, %r883, %r9;
	add.s32 	%r882, %r882, %r9;
	add.s32 	%r884, %r884, 1;
	setp.lt.s32	%p198, %r884, %r14;
	@%p198 bra 	BB8_279;

BB8_280:
	add.s32 	%r877, %r877, 1;
	setp.lt.s32	%p199, %r877, %r376;
	@%p199 bra 	BB8_274;

BB8_281:
	sub.s32 	%r748, %r18, %r15;
	sub.s32 	%r747, %r19, %r17;
	mul.lo.s32 	%r746, %r747, %r748;
	mul.lo.s32 	%r745, %r746, %r20;
	ld.param.u64 	%rd409, [kernel_cuda_filter_construct_transform_param_3];
	cvta.to.global.u64 	%rd384, %rd409;
	mul.wide.s32 	%rd385, %r8, 4;
	add.s64 	%rd181, %rd384, %rd385;
	mov.u32 	%r885, 0;
	st.global.u32 	[%rd181], %r885;
	mul.hi.s32 	%r723, %r745, 1431655766;
	shr.u32 	%r724, %r723, 31;
	add.s32 	%r725, %r723, %r724;
	min.s32 	%r399, %r14, %r725;
	setp.lt.ftz.f32	%p200, %f135, 0f00000000;
	@%p200 bra 	BB8_287;
	bra.uni 	BB8_282;

BB8_287:
	add.s32 	%r404, %r14, 1;
	mov.f32 	%f765, 0f00000000;
	mov.u32 	%r887, 0;
	mov.u32 	%r888, %r887;

BB8_288:
	mul.wide.s32 	%rd391, %r887, 4;
	add.s64 	%rd392, %rd5, %rd391;
	ld.local.f32 	%f721, [%rd392];
	add.ftz.f32 	%f765, %f765, %f721;
	add.s32 	%r887, %r887, %r404;
	add.s32 	%r888, %r888, 1;
	setp.lt.s32	%p205, %r888, %r14;
	@%p205 bra 	BB8_288;

	add.ftz.f32 	%f723, %f135, 0f3F800000;
	mul.ftz.f32 	%f131, %f723, %f765;
	mov.f32 	%f766, 0f00000000;
	mov.u32 	%r889, 0;
	setp.lt.s32	%p206, %r399, 1;
	@%p206 bra 	BB8_293;

	mov.u32 	%r890, %r889;

BB8_291:
	setp.ge.ftz.f32	%p207, %f766, %f131;
	setp.gt.s32	%p208, %r890, 1;
	and.pred  	%p209, %p208, %p207;
	@%p209 bra 	BB8_293;

	mad.lo.s32 	%r733, %r890, %r14, %r890;
	mul.wide.s32 	%rd393, %r733, 4;
	add.s64 	%rd394, %rd5, %rd393;
	ld.local.f32 	%f724, [%rd394];
	add.ftz.f32 	%f766, %f766, %f724;
	add.s32 	%r889, %r889, 1;
	st.global.u32 	[%rd181], %r889;
	add.s32 	%r890, %r890, 1;
	setp.lt.s32	%p210, %r890, %r399;
	@%p210 bra 	BB8_291;
	bra.uni 	BB8_293;

BB8_282:
	setp.lt.s32	%p201, %r399, 1;
	@%p201 bra 	BB8_293;

	mov.u32 	%r886, %r885;

BB8_284:
	setp.lt.s32	%p202, %r886, 2;
	@%p202 bra 	BB8_286;

	mad.lo.s32 	%r728, %r886, %r14, %r886;
	mul.wide.s32 	%rd386, %r728, 4;
	add.s64 	%rd387, %rd5, %rd386;
	ld.local.f32 	%f718, [%rd387];
	sqrt.approx.ftz.f32 	%f719, %f718;
	setp.lt.ftz.f32	%p203, %f719, %f135;
	@%p203 bra 	BB8_293;

BB8_286:
	add.s32 	%r886, %r886, 1;
	add.s32 	%r885, %r885, 1;
	st.global.u32 	[%rd181], %r885;
	setp.lt.s32	%p204, %r886, %r399;
	@%p204 bra 	BB8_284;

BB8_293:
	mov.u32 	%r891, 0;

BB8_294:
	setp.lt.s32	%p211, %r891, 1;
	@%p211 bra 	BB8_297;

	mul.lo.s32 	%r893, %r9, %r891;
	mul.lo.s32 	%r892, %r893, %r14;
	mov.u32 	%r894, 0;

BB8_296:
	add.s32 	%r736, %r892, %r8;
	mul.wide.s32 	%rd398, %r736, 4;
	add.s64 	%rd399, %rd1, %rd398;
	ld.global.f32 	%f725, [%rd399];
	add.s32 	%r737, %r893, %r8;
	mul.wide.s32 	%rd400, %r737, 4;
	add.s64 	%rd401, %rd1, %rd400;
	ld.global.f32 	%f726, [%rd401];
	st.global.f32 	[%rd399], %f726;
	st.global.f32 	[%rd401], %f725;
	add.s32 	%r893, %r893, %r67;
	add.s32 	%r892, %r892, %r9;
	add.s32 	%r894, %r894, 1;
	setp.lt.s32	%p212, %r894, %r891;
	@%p212 bra 	BB8_296;

BB8_297:
	add.s32 	%r891, %r891, 1;
	setp.lt.s32	%p213, %r891, %r14;
	@%p213 bra 	BB8_294;

	ld.global.u32 	%r898, [%rd181];
	mov.u32 	%r896, 0;

BB8_299:
	setp.lt.s32	%p214, %r898, 1;
	@%p214 bra 	BB8_302;

	mul.wide.s32 	%rd405, %r896, 4;
	add.s64 	%rd406, %rd4, %rd405;
	ld.local.f32 	%f134, [%rd406];
	mul.lo.s32 	%r429, %r896, %r14;
	mov.u32 	%r897, 0;

BB8_301:
	add.s32 	%r740, %r897, %r429;
	mad.lo.s32 	%r741, %r740, %r9, %r8;
	mul.wide.s32 	%rd407, %r741, 4;
	add.s64 	%rd408, %rd1, %rd407;
	ld.global.f32 	%f727, [%rd408];
	mul.ftz.f32 	%f728, %f134, %f727;
	st.global.f32 	[%rd408], %f728;
	ld.global.u32 	%r898, [%rd181];
	add.s32 	%r897, %r897, 1;
	setp.lt.s32	%p215, %r897, %r898;
	@%p215 bra 	BB8_301;

BB8_302:
	add.s32 	%r896, %r896, 1;
	setp.lt.s32	%p216, %r896, %r14;
	@%p216 bra 	BB8_299;

BB8_303:
	ret;
}

	// .globl	kernel_cuda_filter_nlm_calc_difference
// kernel_cuda_filter_nlm_calc_difference
//
// Each thread handles one (offset, pixel) pair and stores one f32 result.
//   param_0, param_1   : input f32 buffers, read through ld.global.nc and
//                        addressed with the same indices
//   param_2            : optional f32 buffer; when non-null it supplies a
//                        per-pixel scale factor clamped to [0.25, 4.0]
//   param_3            : output f32 buffer
//   param_4, param_5   : extents used to clip the x / y ranges
//   param_6            : row stride (in elements) for pixel addressing
//   param_7            : element stride between per-offset output slices
//   param_8            : r; the global y index is split by (2*r + 1)
//   param_9            : element stride between accumulated planes;
//                        0 selects 1 pass, anything else selects 3 passes
//   param_10           : extra amount added to the x offset of the second pixel
//   param_11, param_12 : f32 coefficients of the per-plane term
// NOTE(review): the roles of param_0 / param_1 (e.g. image vs. variance) are
// not visible in this PTX; confirm against the CUDA source.
.visible .entry kernel_cuda_filter_nlm_calc_difference(
	.param .u64 kernel_cuda_filter_nlm_calc_difference_param_0,
	.param .u64 kernel_cuda_filter_nlm_calc_difference_param_1,
	.param .u64 kernel_cuda_filter_nlm_calc_difference_param_2,
	.param .u64 kernel_cuda_filter_nlm_calc_difference_param_3,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_4,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_5,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_6,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_7,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_8,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_9,
	.param .u32 kernel_cuda_filter_nlm_calc_difference_param_10,
	.param .f32 kernel_cuda_filter_nlm_calc_difference_param_11,
	.param .f32 kernel_cuda_filter_nlm_calc_difference_param_12
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<16>;
	.reg .f32 	%f<38>;
	.reg .b32 	%r<66>;
	.reg .b64 	%rd<42>;


	// Load all kernel parameters into registers.
	ld.param.u64 	%rd17, [kernel_cuda_filter_nlm_calc_difference_param_0];
	ld.param.u64 	%rd18, [kernel_cuda_filter_nlm_calc_difference_param_1];
	ld.param.u64 	%rd19, [kernel_cuda_filter_nlm_calc_difference_param_2];
	ld.param.u64 	%rd20, [kernel_cuda_filter_nlm_calc_difference_param_3];
	ld.param.u32 	%r21, [kernel_cuda_filter_nlm_calc_difference_param_4];
	ld.param.u32 	%r22, [kernel_cuda_filter_nlm_calc_difference_param_5];
	ld.param.u32 	%r23, [kernel_cuda_filter_nlm_calc_difference_param_6];
	ld.param.u32 	%r24, [kernel_cuda_filter_nlm_calc_difference_param_7];
	ld.param.u32 	%r25, [kernel_cuda_filter_nlm_calc_difference_param_8];
	ld.param.u32 	%r26, [kernel_cuda_filter_nlm_calc_difference_param_9];
	ld.param.u32 	%r27, [kernel_cuda_filter_nlm_calc_difference_param_10];
	ld.param.f32 	%f8, [kernel_cuda_filter_nlm_calc_difference_param_11];
	ld.param.f32 	%f9, [kernel_cuda_filter_nlm_calc_difference_param_12];
	// %r1 = 2 * param_8 + 1.
	shl.b32 	%r29, %r25, 1;
	add.s32 	%r1, %r29, 1;
	// %r33 = ntid.y * ctaid.y + tid.y (global y index of this thread).
	mov.u32 	%r30, %ntid.y;
	mov.u32 	%r31, %ctaid.y;
	mov.u32 	%r32, %tid.y;
	mad.lo.s32 	%r33, %r30, %r31, %r32;
	// Split the y index: %r2 = %r33 % %r1, %r3 = %r33 / %r1.
	rem.s32 	%r2, %r33, %r1;
	div.s32 	%r3, %r33, %r1;
	// %p15 is the "skip this thread" flag (starts true); %rd37 is the output
	// slice offset (starts 0). Both are only updated on the fully valid path.
	mov.pred 	%p15, -1;
	mov.u64 	%rd37, 0;
	// Skip when %r3 >= %r1.
	setp.ge.s32	%p3, %r3, %r1;
	@%p3 bra 	BB9_4;

	// Signed offsets: %r62 = %r2 - param_8 (x), %r61 = %r3 - param_8 (y).
	// Clip so that x and x + %r62 both lie in [0, param_4), and y and y + %r61
	// both lie in [0, param_5):
	//   x range [%r4, %r7): %r4 = max(0, -%r62), %r7 = min(param_4, param_4 - max(0, %r62))
	//   y range [%r6, %r8): %r6 = max(0, -%r61), %r8 = min(param_5, param_5 - max(0, %r61))
	sub.s32 	%r62, %r2, %r25;
	neg.s32 	%r36, %r62;
	mov.u32 	%r37, 0;
	max.s32 	%r4, %r37, %r36;
	sub.s32 	%r61, %r3, %r25;
	neg.s32 	%r38, %r61;
	max.s32 	%r6, %r37, %r38;
	max.s32 	%r39, %r37, %r62;
	sub.s32 	%r40, %r21, %r39;
	max.s32 	%r41, %r37, %r61;
	sub.s32 	%r42, %r22, %r41;
	min.s32 	%r7, %r21, %r40;
	min.s32 	%r8, %r22, %r42;
	// Skip when either clipped range is empty.
	setp.le.s32	%p5, %r8, %r6;
	setp.le.s32	%p6, %r7, %r4;
	or.pred  	%p7, %p5, %p6;
	@%p7 bra 	BB9_4;

	// %r47 = ntid.x * ctaid.x + tid.x (global x index), unpacked over the
	// clipped width %r48 = %r7 - %r4:
	//   %r64 = %r47 % %r48 + %r4   (pixel x)
	//   %r63 = %r47 / %r48 + %r6   (pixel y)
	mov.u32 	%r44, %ctaid.x;
	mov.u32 	%r45, %ntid.x;
	mov.u32 	%r46, %tid.x;
	mad.lo.s32 	%r47, %r45, %r44, %r46;
	sub.s32 	%r48, %r7, %r4;
	rem.s32 	%r49, %r47, %r48;
	add.s32 	%r64, %r49, %r4;
	div.s32 	%r50, %r47, %r48;
	add.s32 	%r63, %r50, %r6;
	// Skip when the row falls past the clipped y range.
	setp.ge.s32	%p9, %r63, %r8;
	@%p9 bra 	BB9_4;

	// Valid thread: %rd37 = (%r3 * %r1 + %r2) * param_7 (sign-extended to 64 bit),
	// and clear the skip flag.
	mad.lo.s32 	%r51, %r3, %r1, %r2;
	mul.lo.s32 	%r52, %r51, %r24;
	cvt.s64.s32	%rd37, %r52;
	mov.pred 	%p15, 0;

BB9_4:
	// Every early-out above arrives here with %p15 still true.
	@%p15 bra 	BB9_12;

	// %r16 = %r63 * param_6 + %r64: element index of this thread's pixel.
	mad.lo.s32 	%r16, %r63, %r23, %r64;
	// %r17 = x offset + param_10.
	add.s32 	%r17, %r62, %r27;
	// %r18 = pass count: 1 when param_9 == 0, otherwise 3.
	setp.eq.s32	%p11, %r26, 0;
	selp.b32	%r18, 1, 3, %p11;
	// Scale factor %f35 defaults to 1.0 when param_2 is a null pointer.
	setp.eq.s64	%p12, %rd19, 0;
	mov.f32 	%f35, 0f3F800000;
	@%p12 bra 	BB9_7;

	// param_2 is non-null:
	//   %f35 = clamp(param_2[pixel] / param_2[offset pixel], 0.25, 4.0)
	// where the offset pixel index is (%r63 + %r61) * param_6 + (%r17 + %r64).
	add.s32 	%r53, %r17, %r64;
	add.s32 	%r54, %r63, %r61;
	mad.lo.s32 	%r55, %r54, %r23, %r53;
	cvta.to.global.u64 	%rd24, %rd19;
	mul.wide.s32 	%rd25, %r16, 4;
	add.s64 	%rd26, %rd24, %rd25;
	mul.wide.s32 	%rd27, %r55, 4;
	add.s64 	%rd28, %rd24, %rd27;
	ld.global.nc.f32 	%f11, [%rd28];
	ld.global.nc.f32 	%f12, [%rd26];
	div.approx.ftz.f32 	%f13, %f12, %f11;
	mov.f32 	%f14, 0f3E800000;
	max.ftz.f32 	%f15, %f13, %f14;
	mov.f32 	%f16, 0f40800000;
	min.ftz.f32 	%f35, %f15, %f16;

BB9_7:
	// Set up the four running pointers for the plane loop:
	//   %rd38 = &param_0[pixel]          %rd40 = &param_0[offset pixel]
	//   %rd39 = &param_1[pixel]          %rd41 = &param_1[offset pixel]
	// %rd4 = 4 * param_9 is the byte step applied to all four after each pass.
	// %f3 = %f35 * %f35; %f36 is the accumulator; %r65 is the pass counter.
	cvta.to.global.u64 	%rd29, %rd17;
	mul.ftz.f32 	%f3, %f35, %f35;
	add.s32 	%r57, %r63, %r61;
	add.s32 	%r58, %r17, %r64;
	mad.lo.s32 	%r59, %r23, %r57, %r58;
	cvta.to.global.u64 	%rd30, %rd18;
	mul.wide.s32 	%rd31, %r59, 4;
	add.s64 	%rd41, %rd30, %rd31;
	mul.wide.s32 	%rd4, %r26, 4;
	add.s64 	%rd40, %rd29, %rd31;
	mul.wide.s32 	%rd32, %r16, 4;
	add.s64 	%rd39, %rd30, %rd32;
	add.s64 	%rd38, %rd29, %rd32;
	cvta.to.global.u64 	%rd8, %rd20;
	mov.f32 	%f36, 0f00000000;
	mov.u32 	%r65, 0;

BB9_8:
	// One pass, with a = param_0[pixel], b = param_0[offset pixel],
	// va = param_1[pixel], vb = param_1[offset pixel], s = %f35:
	//   term = ((a - s*b)^2 - param_11 * (va + min(va, s^2 * vb)))
	//          / (param_12 * (va + s^2 * vb) + 0f322BCC77)
	// 0f322BCC77 is about 1e-8 and keeps the denominator away from zero when
	// the other addend is 0.
	ld.global.nc.f32 	%f18, [%rd40];
	mul.ftz.f32 	%f19, %f35, %f18;
	ld.global.nc.f32 	%f20, [%rd38];
	sub.ftz.f32 	%f21, %f20, %f19;
	ld.global.nc.f32 	%f22, [%rd41];
	mul.ftz.f32 	%f23, %f3, %f22;
	mul.ftz.f32 	%f24, %f21, %f21;
	ld.global.nc.f32 	%f25, [%rd39];
	min.ftz.f32 	%f26, %f25, %f23;
	add.ftz.f32 	%f27, %f25, %f26;
	mul.ftz.f32 	%f28, %f27, %f8;
	sub.ftz.f32 	%f29, %f24, %f28;
	add.ftz.f32 	%f30, %f25, %f23;
	fma.rn.ftz.f32 	%f31, %f30, %f9, 0f322BCC77;
	div.approx.ftz.f32 	%f32, %f29, %f31;
	add.ftz.f32 	%f36, %f36, %f32;
	// Advance all four pointers by one plane and loop while %r65 < %r18.
	add.s64 	%rd41, %rd41, %rd4;
	add.s64 	%rd40, %rd40, %rd4;
	add.s64 	%rd39, %rd39, %rd4;
	add.s64 	%rd38, %rd38, %rd4;
	add.s32 	%r65, %r65, 1;
	setp.lt.s32	%p13, %r65, %r18;
	@%p13 bra 	BB9_8;

	// Single-pass case (param_9 == 0): store the sum unscaled.
	@%p11 bra 	BB9_11;

	// Multi-pass case: multiply the sum by 1 / %r18 (average over the passes).
	cvt.rn.f32.s32	%f33, %r18;
	rcp.approx.ftz.f32 	%f34, %f33;
	mul.ftz.f32 	%f36, %f36, %f34;

BB9_11:
	// Store to param_3[%r16 + %rd37]; the index is formed in 64 bit.
	cvt.s64.s32	%rd33, %r16;
	add.s64 	%rd34, %rd33, %rd37;
	shl.b64 	%rd35, %rd34, 2;
	add.s64 	%rd36, %rd8, %rd35;
	st.global.f32 	[%rd36], %f36;

BB9_12:
	ret;
}

	// .globl	kernel_cuda_filter_nlm_blur
// kernel_cuda_filter_nlm_blur
//
// Each thread averages param_0 over a clipped run of rows around its own row
// (same column, same slice) and stores the mean in param_1.
//   param_0          : input f32 buffer (read through ld.global.nc)
//   param_1          : output f32 buffer
//   param_2, param_3 : extents used to clip the x / y ranges
//   param_4          : row stride in elements
//   param_5          : element stride between slices
//   param_6          : r; the global y index is split by (2*r + 1)
//   param_7          : number of rows taken on each side of the thread's row
.visible .entry kernel_cuda_filter_nlm_blur(
	.param .u64 kernel_cuda_filter_nlm_blur_param_0,
	.param .u64 kernel_cuda_filter_nlm_blur_param_1,
	.param .u32 kernel_cuda_filter_nlm_blur_param_2,
	.param .u32 kernel_cuda_filter_nlm_blur_param_3,
	.param .u32 kernel_cuda_filter_nlm_blur_param_4,
	.param .u32 kernel_cuda_filter_nlm_blur_param_5,
	.param .u32 kernel_cuda_filter_nlm_blur_param_6,
	.param .u32 kernel_cuda_filter_nlm_blur_param_7
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<18>;
	.reg .f32 	%f<32>;
	.reg .b32 	%r<80>;
	.reg .b64 	%rd<35>;


	// Load all kernel parameters; %rd1 = param_0 as a global-space address.
	ld.param.u64 	%rd6, [kernel_cuda_filter_nlm_blur_param_0];
	ld.param.u64 	%rd4, [kernel_cuda_filter_nlm_blur_param_1];
	ld.param.u32 	%r32, [kernel_cuda_filter_nlm_blur_param_2];
	ld.param.u32 	%r33, [kernel_cuda_filter_nlm_blur_param_3];
	ld.param.u32 	%r34, [kernel_cuda_filter_nlm_blur_param_4];
	ld.param.u32 	%r35, [kernel_cuda_filter_nlm_blur_param_5];
	ld.param.u32 	%r36, [kernel_cuda_filter_nlm_blur_param_6];
	ld.param.u32 	%r37, [kernel_cuda_filter_nlm_blur_param_7];
	cvta.to.global.u64 	%rd1, %rd6;
	// %r1 = 2 * param_6 + 1.
	shl.b32 	%r39, %r36, 1;
	add.s32 	%r1, %r39, 1;
	// %r43 = ntid.y * ctaid.y + tid.y, split into %r2 = %r43 % %r1 and
	// %r3 = %r43 / %r1.
	mov.u32 	%r40, %ntid.y;
	mov.u32 	%r41, %ctaid.y;
	mov.u32 	%r42, %tid.y;
	mad.lo.s32 	%r43, %r40, %r41, %r42;
	rem.s32 	%r2, %r43, %r1;
	div.s32 	%r3, %r43, %r1;
	// %p17 is the "skip this thread" flag (starts true); %rd34 is the slice
	// offset (starts 0).
	mov.pred 	%p17, -1;
	mov.u64 	%rd34, 0;
	// Skip when %r3 >= %r1.
	setp.ge.s32	%p3, %r3, %r1;
	@%p3 bra 	BB10_4;

	// Signed offsets: %r45 = %r2 - param_6 (x), %r48 = %r3 - param_6 (y).
	// Clipped ranges:
	//   x in [%r4, %r7):  %r4  = max(0, -%r45), %r7 = min(param_2, param_2 - max(0, %r45))
	//   y in [%r74, %r8): %r74 = max(0, -%r48), %r8 = min(param_3, %r73)
	//                     with %r73 = param_3 - max(0, %r48)
	sub.s32 	%r45, %r2, %r36;
	neg.s32 	%r46, %r45;
	mov.u32 	%r47, 0;
	max.s32 	%r4, %r47, %r46;
	sub.s32 	%r48, %r3, %r36;
	neg.s32 	%r49, %r48;
	max.s32 	%r74, %r47, %r49;
	max.s32 	%r50, %r47, %r45;
	sub.s32 	%r51, %r32, %r50;
	max.s32 	%r52, %r47, %r48;
	sub.s32 	%r73, %r33, %r52;
	min.s32 	%r7, %r32, %r51;
	min.s32 	%r8, %r33, %r73;
	// Skip when either clipped range is empty.
	setp.le.s32	%p5, %r8, %r74;
	setp.le.s32	%p6, %r7, %r4;
	or.pred  	%p7, %p5, %p6;
	@%p7 bra 	BB10_4;

	// %r57 = ntid.x * ctaid.x + tid.x, unpacked over the clipped width
	// %r58 = %r7 - %r4:
	//   %r72 = %r57 % %r58 + %r4    (pixel x)
	//   %r71 = %r57 / %r58 + %r74   (pixel y)
	mov.u32 	%r54, %ctaid.x;
	mov.u32 	%r55, %ntid.x;
	mov.u32 	%r56, %tid.x;
	mad.lo.s32 	%r57, %r55, %r54, %r56;
	sub.s32 	%r58, %r7, %r4;
	rem.s32 	%r59, %r57, %r58;
	add.s32 	%r72, %r59, %r4;
	div.s32 	%r60, %r57, %r58;
	add.s32 	%r71, %r60, %r74;
	// Skip when the row falls past the clipped y range.
	setp.ge.s32	%p9, %r71, %r8;
	@%p9 bra 	BB10_4;

	// Valid thread: %rd34 = (%r3 * %r1 + %r2) * param_5, and clear the skip flag.
	mad.lo.s32 	%r61, %r3, %r1, %r2;
	mul.lo.s32 	%r62, %r61, %r35;
	cvt.s64.s32	%rd34, %r62;
	mov.pred 	%p17, 0;

BB10_4:
	// Every early-out above arrives here with %p17 still true.
	@%p17 bra 	BB10_16;

	// Row window [%r77, %r16):
	//   %r77 = max(%r74, %r71 - param_7)
	//   %r16 = min(%r73, %r71 + param_7 + 1)
	// %r17 = %r16 - %r77 is the number of rows summed; %f31 is the running sum.
	sub.s32 	%r63, %r71, %r37;
	max.s32 	%r77, %r74, %r63;
	add.s32 	%r64, %r37, %r71;
	add.s32 	%r65, %r64, 1;
	min.s32 	%r16, %r73, %r65;
	sub.s32 	%r17, %r16, %r77;
	mov.f32 	%f31, 0f00000000;
	// Empty window: go straight to the store with a zero sum.
	setp.le.s32	%p11, %r16, %r77;
	@%p11 bra 	BB10_15;

	// The summation loop is unrolled by 4. First peel %r17 & 3 leading rows:
	// remainder 3 falls through all three peel steps, 2 enters at BB10_10,
	// 1 enters at BB10_11, 0 skips to BB10_12.
	and.b32  	%r18, %r17, 3;
	setp.eq.s32	%p12, %r18, 0;
	mov.f32 	%f31, 0f00000000;
	@%p12 bra 	BB10_12;

	setp.eq.s32	%p13, %r18, 1;
	mov.f32 	%f28, 0f00000000;
	@%p13 bra 	BB10_11;

	setp.eq.s32	%p14, %r18, 2;
	mov.f32 	%f27, 0f00000000;
	@%p14 bra 	BB10_10;

	// Peel step 1: %f27 = param_0[%r77 * param_4 + %r72 + %rd34]; next row.
	mad.lo.s32 	%r66, %r77, %r34, %r72;
	cvt.s64.s32	%rd9, %r66;
	add.s64 	%rd10, %rd9, %rd34;
	shl.b64 	%rd11, %rd10, 2;
	add.s64 	%rd12, %rd1, %rd11;
	ld.global.nc.f32 	%f14, [%rd12];
	add.ftz.f32 	%f27, %f14, 0f00000000;
	add.s32 	%r77, %r77, 1;

BB10_10:
	// Peel step 2: %f28 = %f27 + element of the current row; next row.
	mad.lo.s32 	%r67, %r77, %r34, %r72;
	cvt.s64.s32	%rd13, %r67;
	add.s64 	%rd14, %rd13, %rd34;
	shl.b64 	%rd15, %rd14, 2;
	add.s64 	%rd16, %rd1, %rd15;
	ld.global.nc.f32 	%f15, [%rd16];
	add.ftz.f32 	%f28, %f27, %f15;
	add.s32 	%r77, %r77, 1;

BB10_11:
	// Peel step 3: %f31 = %f28 + element of the current row; next row.
	mad.lo.s32 	%r68, %r77, %r34, %r72;
	cvt.s64.s32	%rd17, %r68;
	add.s64 	%rd18, %rd17, %rd34;
	shl.b64 	%rd19, %rd18, 2;
	add.s64 	%rd20, %rd1, %rd19;
	ld.global.nc.f32 	%f16, [%rd20];
	add.ftz.f32 	%f31, %f28, %f16;
	add.s32 	%r77, %r77, 1;

BB10_12:
	// Fewer than 4 rows in total means the peel already covered everything.
	setp.lt.u32	%p15, %r17, 4;
	@%p15 bra 	BB10_15;

	// %r25 = 4 * param_4 serves two purposes below: as the byte offset between
	// consecutive rows (one row = param_4 floats of 4 bytes) and as the element
	// increment of %r78 per iteration (4 rows).
	// %r78 = %r77 * param_4 + %r72 is the element index of the current row.
	shl.b32 	%r25, %r34, 2;
	mad.lo.s32 	%r78, %r77, %r34, %r72;

BB10_14:
	// Unrolled body: add the elements of 4 consecutive rows to %f31.
	cvt.s64.s32	%rd21, %r78;
	add.s64 	%rd22, %rd21, %rd34;
	shl.b64 	%rd23, %rd22, 2;
	add.s64 	%rd24, %rd1, %rd23;
	ld.global.nc.f32 	%f17, [%rd24];
	add.ftz.f32 	%f18, %f31, %f17;
	cvt.s64.s32	%rd25, %r25;
	add.s64 	%rd26, %rd24, %rd25;
	ld.global.nc.f32 	%f19, [%rd26];
	add.ftz.f32 	%f20, %f18, %f19;
	add.s64 	%rd27, %rd26, %rd25;
	ld.global.nc.f32 	%f21, [%rd27];
	add.ftz.f32 	%f22, %f20, %f21;
	add.s64 	%rd28, %rd27, %rd25;
	ld.global.nc.f32 	%f23, [%rd28];
	add.ftz.f32 	%f31, %f22, %f23;
	add.s32 	%r78, %r78, %r25;
	add.s32 	%r77, %r77, 4;
	setp.lt.s32	%p16, %r77, %r16;
	@%p16 bra 	BB10_14;

BB10_15:
	// mean = %f31 * (1 / (float)%r17), stored to
	// param_1[%r71 * param_4 + %r72 + %rd34].
	// NOTE(review): this point is also reached from the empty-window branch
	// with %r17 <= 0. For param_7 >= 0 the window should always include row
	// %r71 (since %r74 <= %r71 < %r8 <= %r73) - confirm param_7 is never negative.
	cvt.rn.f32.s32	%f24, %r17;
	rcp.approx.ftz.f32 	%f25, %f24;
	mul.ftz.f32 	%f26, %f31, %f25;
	mad.lo.s32 	%r70, %r71, %r34, %r72;
	cvt.s64.s32	%rd29, %r70;
	add.s64 	%rd30, %rd34, %rd29;
	cvta.to.global.u64 	%rd31, %rd4;
	shl.b64 	%rd32, %rd30, 2;
	add.s64 	%rd33, %rd31, %rd32;
	st.global.f32 	[%rd33], %f26;

BB10_16:
	ret;
}

	// .globl	kernel_cuda_filter_nlm_calc_weight
// kernel_cuda_filter_nlm_calc_weight
//
// Each thread averages param_0 over a clipped run of consecutive elements
// around its own column (same row, same slice), clamps the mean to >= 0,
// converts it to a weight through the exponent sequence at BB11_15, and
// stores that weight in param_1.
//   param_0          : input f32 buffer (read through ld.global.nc)
//   param_1          : output f32 buffer
//   param_2, param_3 : extents used to clip the x / y ranges
//   param_4          : row stride in elements
//   param_5          : element stride between slices
//   param_6          : r; the global y index is split by (2*r + 1)
//   param_7          : number of elements taken on each side of the thread's column
.visible .entry kernel_cuda_filter_nlm_calc_weight(
	.param .u64 kernel_cuda_filter_nlm_calc_weight_param_0,
	.param .u64 kernel_cuda_filter_nlm_calc_weight_param_1,
	.param .u32 kernel_cuda_filter_nlm_calc_weight_param_2,
	.param .u32 kernel_cuda_filter_nlm_calc_weight_param_3,
	.param .u32 kernel_cuda_filter_nlm_calc_weight_param_4,
	.param .u32 kernel_cuda_filter_nlm_calc_weight_param_5,
	.param .u32 kernel_cuda_filter_nlm_calc_weight_param_6,
	.param .u32 kernel_cuda_filter_nlm_calc_weight_param_7
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<18>;
	.reg .f32 	%f<52>;
	.reg .b32 	%r<80>;
	.reg .b64 	%rd<34>;


	// Load all kernel parameters; %rd1 = param_0 as a global-space address.
	ld.param.u64 	%rd9, [kernel_cuda_filter_nlm_calc_weight_param_0];
	ld.param.u64 	%rd7, [kernel_cuda_filter_nlm_calc_weight_param_1];
	ld.param.u32 	%r29, [kernel_cuda_filter_nlm_calc_weight_param_2];
	ld.param.u32 	%r30, [kernel_cuda_filter_nlm_calc_weight_param_3];
	ld.param.u32 	%r31, [kernel_cuda_filter_nlm_calc_weight_param_4];
	ld.param.u32 	%r32, [kernel_cuda_filter_nlm_calc_weight_param_5];
	ld.param.u32 	%r33, [kernel_cuda_filter_nlm_calc_weight_param_6];
	ld.param.u32 	%r34, [kernel_cuda_filter_nlm_calc_weight_param_7];
	cvta.to.global.u64 	%rd1, %rd9;
	// %r1 = 2 * param_6 + 1.
	shl.b32 	%r36, %r33, 1;
	add.s32 	%r1, %r36, 1;
	// %r40 = ntid.y * ctaid.y + tid.y, split into %r2 = %r40 % %r1 and
	// %r3 = %r40 / %r1.
	mov.u32 	%r37, %ntid.y;
	mov.u32 	%r38, %ctaid.y;
	mov.u32 	%r39, %tid.y;
	mad.lo.s32 	%r40, %r37, %r38, %r39;
	rem.s32 	%r2, %r40, %r1;
	div.s32 	%r3, %r40, %r1;
	// %p17 is the "skip this thread" flag (starts true); %rd32 is the slice
	// offset (starts 0).
	mov.pred 	%p17, -1;
	mov.u64 	%rd32, 0;
	// Skip when %r3 >= %r1.
	setp.ge.s32	%p3, %r3, %r1;
	@%p3 bra 	BB11_4;

	// Signed offsets: %r42 = %r2 - param_6 (x), %r45 = %r3 - param_6 (y).
	// Clipped ranges:
	//   x in [%r75, %r7): %r75 = max(0, -%r42), %r7 = min(param_2, %r74)
	//                     with %r74 = param_2 - max(0, %r42)
	//   y in [%r5, %r8):  %r5  = max(0, -%r45), %r8 = min(param_3, param_3 - max(0, %r45))
	sub.s32 	%r42, %r2, %r33;
	neg.s32 	%r43, %r42;
	mov.u32 	%r44, 0;
	max.s32 	%r75, %r44, %r43;
	sub.s32 	%r45, %r3, %r33;
	neg.s32 	%r46, %r45;
	max.s32 	%r5, %r44, %r46;
	max.s32 	%r47, %r44, %r42;
	sub.s32 	%r74, %r29, %r47;
	max.s32 	%r48, %r44, %r45;
	sub.s32 	%r49, %r30, %r48;
	min.s32 	%r7, %r29, %r74;
	min.s32 	%r8, %r30, %r49;
	// Skip when either clipped range is empty.
	setp.le.s32	%p5, %r8, %r5;
	setp.le.s32	%p6, %r7, %r75;
	or.pred  	%p7, %p5, %p6;
	@%p7 bra 	BB11_4;

	// %r54 = ntid.x * ctaid.x + tid.x, unpacked over the clipped width
	// %r55 = %r7 - %r75:
	//   %r9  = %r54 % %r55          (x relative to %r75)
	//   %r72 = %r54 / %r55 + %r5    (pixel y)
	mov.u32 	%r51, %ctaid.x;
	mov.u32 	%r52, %ntid.x;
	mov.u32 	%r53, %tid.x;
	mad.lo.s32 	%r54, %r52, %r51, %r53;
	sub.s32 	%r55, %r7, %r75;
	rem.s32 	%r9, %r54, %r55;
	div.s32 	%r56, %r54, %r55;
	add.s32 	%r72, %r56, %r5;
	// Skip when the row falls past the clipped y range.
	setp.ge.s32	%p9, %r72, %r8;
	@%p9 bra 	BB11_4;

	// Valid thread: %rd32 = (%r3 * %r1 + %r2) * param_5, %r73 = %r9 + %r75
	// (pixel x), and clear the skip flag.
	mad.lo.s32 	%r57, %r3, %r1, %r2;
	mul.lo.s32 	%r58, %r57, %r32;
	cvt.s64.s32	%rd32, %r58;
	add.s32 	%r73, %r9, %r75;
	mov.pred 	%p17, 0;

BB11_4:
	// Every early-out above arrives here with %p17 still true.
	@%p17 bra 	BB11_16;

	// Column window [%r78, %r17):
	//   %r78 = max(%r75, %r73 - param_7)
	//   %r17 = min(%r74, %r73 + param_7 + 1)
	// %r18 = %r72 * param_4 is the row base index; %r19 = %r17 - %r78 is the
	// number of elements summed; %f51 is the running sum; %f10 stays 0.0 and
	// is reused as the clamp bound at BB11_15.
	sub.s32 	%r59, %r73, %r34;
	max.s32 	%r78, %r75, %r59;
	add.s32 	%r60, %r34, %r73;
	add.s32 	%r61, %r60, 1;
	min.s32 	%r17, %r74, %r61;
	mul.lo.s32 	%r18, %r72, %r31;
	sub.s32 	%r19, %r17, %r78;
	mov.f32 	%f10, 0f00000000;
	// Empty window: go straight to the weight computation with a zero sum.
	setp.le.s32	%p11, %r17, %r78;
	mov.f32 	%f51, %f10;
	@%p11 bra 	BB11_15;

	// The summation loop is unrolled by 4. First peel %r19 & 3 leading
	// elements: remainder 3 falls through all three peel steps, 2 enters at
	// BB11_10, 1 enters at BB11_11, 0 skips to BB11_12.
	and.b32  	%r20, %r19, 3;
	setp.eq.s32	%p12, %r20, 0;
	mov.f32 	%f51, 0f00000000;
	@%p12 bra 	BB11_12;

	setp.eq.s32	%p13, %r20, 1;
	mov.f32 	%f48, 0f00000000;
	@%p13 bra 	BB11_11;

	setp.eq.s32	%p14, %r20, 2;
	mov.f32 	%f47, 0f00000000;
	@%p14 bra 	BB11_10;

	// Peel step 1: %f47 = param_0[%r78 + %r18 + %rd32]; next element.
	add.s32 	%r62, %r78, %r18;
	cvt.s64.s32	%rd12, %r62;
	add.s64 	%rd13, %rd12, %rd32;
	shl.b64 	%rd14, %rd13, 2;
	add.s64 	%rd15, %rd1, %rd14;
	ld.global.nc.f32 	%f14, [%rd15];
	add.ftz.f32 	%f47, %f14, 0f00000000;
	add.s32 	%r78, %r78, 1;

BB11_10:
	// Peel step 2: %f48 = %f47 + current element; next element.
	add.s32 	%r63, %r78, %r18;
	cvt.s64.s32	%rd16, %r63;
	add.s64 	%rd17, %rd16, %rd32;
	shl.b64 	%rd18, %rd17, 2;
	add.s64 	%rd19, %rd1, %rd18;
	ld.global.nc.f32 	%f15, [%rd19];
	add.ftz.f32 	%f48, %f47, %f15;
	add.s32 	%r78, %r78, 1;

BB11_11:
	// Peel step 3: %f51 = %f48 + current element; next element.
	add.s32 	%r64, %r78, %r18;
	cvt.s64.s32	%rd20, %r64;
	add.s64 	%rd21, %rd20, %rd32;
	shl.b64 	%rd22, %rd21, 2;
	add.s64 	%rd23, %rd1, %rd22;
	ld.global.nc.f32 	%f16, [%rd23];
	add.ftz.f32 	%f51, %f48, %f16;
	add.s32 	%r78, %r78, 1;

BB11_12:
	// Fewer than 4 elements in total means the peel already covered everything.
	setp.lt.u32	%p15, %r19, 4;
	@%p15 bra 	BB11_15;

	// %rd33 = &param_0[%rd32 + %r72 * param_4 + %r78]: running pointer for
	// the unrolled loop.
	mad.lo.s32 	%r65, %r72, %r31, %r78;
	cvt.s64.s32	%rd24, %r65;
	add.s64 	%rd25, %rd32, %rd24;
	shl.b64 	%rd26, %rd25, 2;
	add.s64 	%rd33, %rd1, %rd26;

BB11_14:
	// Unrolled body: add 4 consecutive floats (16 bytes) to %f51.
	ld.global.nc.f32 	%f17, [%rd33];
	add.ftz.f32 	%f18, %f51, %f17;
	ld.global.nc.f32 	%f19, [%rd33+4];
	add.ftz.f32 	%f20, %f18, %f19;
	ld.global.nc.f32 	%f21, [%rd33+8];
	add.ftz.f32 	%f22, %f20, %f21;
	ld.global.nc.f32 	%f23, [%rd33+12];
	add.ftz.f32 	%f51, %f22, %f23;
	add.s64 	%rd33, %rd33, 16;
	add.s32 	%r78, %r78, 4;
	setp.lt.s32	%p16, %r78, %r17;
	@%p16 bra 	BB11_14;

BB11_15:
	// mean = %f51 * (1 / (float)%r19); x = max(mean, 0.0).
	cvt.rn.f32.s32	%f26, %r19;
	rcp.approx.ftz.f32 	%f27, %f26;
	mul.ftz.f32 	%f28, %f51, %f27;
	max.ftz.f32 	%f30, %f28, %f10;
	// t = -x / 0f3F317218 (0f3F317218 is ln 2), clamped to [-126.0, 126.0]
	// (0fC2FC0000 / 0f42FC0000).
	neg.ftz.f32 	%f31, %f30;
	mov.f32 	%f32, 0f3F317218;
	div.approx.ftz.f32 	%f33, %f31, %f32;
	mov.f32 	%f34, 0fC2FC0000;
	max.ftz.f32 	%f35, %f33, %f34;
	mov.f32 	%f36, 0f42FC0000;
	min.ftz.f32 	%f37, %f35, %f36;
	// Split t: %r69 = t rounded toward zero, %f39 = t - (float)%r69.
	cvt.rzi.ftz.s32.f32	%r69, %f37;
	cvt.rn.f32.s32	%f38, %r69;
	sub.ftz.f32 	%f39, %f37, %f38;
	// %f42 = 1 - (1 - %f39): the remainder, recomputed through two subtractions.
	mov.f32 	%f40, 0f3F800000;
	sub.ftz.f32 	%f41, %f40, %f39;
	sub.ftz.f32 	%f42, %f40, %f41;
	// Degree-5 polynomial in %f42, evaluated in Horner form, constant term 1.0.
	// NOTE(review): the coefficients look like a 2^f approximation, which would
	// make the stored value roughly exp(-x); confirm against the CUDA source.
	fma.rn.ftz.f32 	%f43, %f42, 0f3AAEC44E, 0f3C20BB9A;
	fma.rn.ftz.f32 	%f44, %f42, %f43, 0f3D636733;
	fma.rn.ftz.f32 	%f45, %f42, %f44, 0f3E75F192;
	fma.rn.ftz.f32 	%f46, %f42, %f45, 0f3F3171F1;
	fma.rn.ftz.f32 	%f24, %f42, %f46, 0f3F800000;
	// Reinterpret the polynomial result as an integer, add %r69 << 23 (bit 23
	// is the lowest exponent bit of an f32, so this scales the value by
	// 2^%r69), then reinterpret the sum as a float again.
	// inline asm
	mov.b32 	%r66, %f24;
	// inline asm
	shl.b32 	%r70, %r69, 23;
	add.s32 	%r67, %r66, %r70;
	// inline asm
	mov.b32 	%f25, %r67;
	// inline asm
	// Store the weight to param_1[%r18 + %r73 + %rd32].
	add.s32 	%r71, %r18, %r73;
	cvt.s64.s32	%rd27, %r71;
	add.s64 	%rd28, %rd32, %rd27;
	cvta.to.global.u64 	%rd29, %rd7;
	shl.b64 	%rd30, %rd28, 2;
	add.s64 	%rd31, %rd29, %rd30;
	st.global.f32 	[%rd31], %f25;

BB11_16:
	ret;
}

	// .globl	kernel_cuda_filter_nlm_update_output
// kernel_cuda_filter_nlm_update_output
//
// Each thread handles one (shift, pixel) pair: it sums a horizontal run of
// floats read from param_0, multiplies the sum by the approximate reciprocal of
// the run length, and then stores or atomically accumulates the result.
//
// Parameters (roles as used by the code below):
//   param_0  .u64  float buffer read with ld.global.nc at
//                  shift_index*param_7 + y*param_6 + x1
//   param_1  .u64  float buffer read with ld.global.nc at the shifted pixel
//                  (y+dy)*param_6 + (x+dx); also at +param_8 and +2*param_8
//                  when param_8 != 0
//   param_2  .u64  optional float buffer; when non-null it receives an atomic
//                  add of mean*value at y*param_6 + x
//   param_3  .u64  float buffer at y*param_6 + x: atomic add of the mean when
//                  param_2 is non-null, plain store of the mean otherwise
//   param_4  .u32  exclusive upper bound for x (before clipping by the shift)
//   param_5  .u32  exclusive upper bound for y (before clipping by the shift)
//   param_6  .u32  row stride in elements
//   param_7  .u32  element stride between consecutive shifts inside param_0
//   param_8  .u32  element offset between the three values averaged from
//                  param_1 (0 selects a single value)
//   param_9  .u32  shift radius r; the shift table is (2r+1) x (2r+1)
//   param_10 .u32  half-width f of the horizontal summation window
//
// Thread mapping: the global y index selects the shift (index % (2r+1) gives
// dx + r, index / (2r+1) gives dy + r); the global x index selects the pixel
// inside the rectangle of pixels whose shifted position stays in bounds.
.visible .entry kernel_cuda_filter_nlm_update_output(
	.param .u64 kernel_cuda_filter_nlm_update_output_param_0,
	.param .u64 kernel_cuda_filter_nlm_update_output_param_1,
	.param .u64 kernel_cuda_filter_nlm_update_output_param_2,
	.param .u64 kernel_cuda_filter_nlm_update_output_param_3,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_4,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_5,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_6,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_7,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_8,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_9,
	.param .u32 kernel_cuda_filter_nlm_update_output_param_10
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<20>;
	.reg .f32 	%f<43>;
	.reg .b32 	%r<87>;
	.reg .b64 	%rd<48>;


	// Load all kernel parameters.
	ld.param.u64 	%rd7, [kernel_cuda_filter_nlm_update_output_param_0];
	ld.param.u64 	%rd8, [kernel_cuda_filter_nlm_update_output_param_1];
	ld.param.u64 	%rd9, [kernel_cuda_filter_nlm_update_output_param_2];
	ld.param.u64 	%rd10, [kernel_cuda_filter_nlm_update_output_param_3];
	ld.param.u32 	%r35, [kernel_cuda_filter_nlm_update_output_param_4];
	ld.param.u32 	%r36, [kernel_cuda_filter_nlm_update_output_param_5];
	ld.param.u32 	%r37, [kernel_cuda_filter_nlm_update_output_param_6];
	ld.param.u32 	%r38, [kernel_cuda_filter_nlm_update_output_param_7];
	ld.param.u32 	%r39, [kernel_cuda_filter_nlm_update_output_param_8];
	ld.param.u32 	%r40, [kernel_cuda_filter_nlm_update_output_param_9];
	ld.param.u32 	%r41, [kernel_cuda_filter_nlm_update_output_param_10];
	// %r1 = 2*r + 1, the side length of the shift table (always odd, so never 0).
	shl.b32 	%r43, %r40, 1;
	add.s32 	%r1, %r43, 1;
	// %r47 = global y index = ntid.y * ctaid.y + tid.y.
	mov.u32 	%r44, %ntid.y;
	mov.u32 	%r45, %ctaid.y;
	mov.u32 	%r46, %tid.y;
	mad.lo.s32 	%r47, %r44, %r45, %r46;
	// Split the y index into shift columns (%r2) and shift rows (%r3).
	rem.s32 	%r2, %r47, %r1;
	div.s32 	%r3, %r47, %r1;
	// %p19 is the "nothing to do" flag; it stays true on every early-out path.
	mov.pred 	%p19, -1;
	mov.u64 	%rd46, 0;
	// Shift row beyond the table: this thread has no work.
	setp.ge.s32	%p3, %r3, %r1;
	@%p3 bra 	BB12_4;

	// dx = %r78, dy = %r77 (table position minus radius).
	// Valid pixel rectangle for this shift:
	//   x in [%r82, %r81) with %r82 = max(0, -dx), %r81 = param_4 - max(0, dx)
	//   y in [%r7,  %r10) with %r7  = max(0, -dy), %r10 = min(param_5, param_5 - max(0, dy))
	sub.s32 	%r78, %r2, %r40;
	neg.s32 	%r49, %r78;
	mov.u32 	%r50, 0;
	max.s32 	%r82, %r50, %r49;
	sub.s32 	%r77, %r3, %r40;
	neg.s32 	%r51, %r77;
	max.s32 	%r7, %r50, %r51;
	max.s32 	%r52, %r50, %r78;
	sub.s32 	%r81, %r35, %r52;
	max.s32 	%r53, %r50, %r77;
	sub.s32 	%r54, %r36, %r53;
	min.s32 	%r9, %r35, %r81;
	min.s32 	%r10, %r36, %r54;
	// Empty rectangle in either dimension: no work. This check also guarantees
	// that the divisor %r60 computed below is strictly positive.
	setp.le.s32	%p5, %r10, %r7;
	setp.le.s32	%p6, %r9, %r82;
	or.pred  	%p7, %p5, %p6;
	@%p7 bra 	BB12_4;

	// %r59 = global x index; split it into a column (%r11) and a row (%r61)
	// of the valid rectangle, whose width is %r60.
	mov.u32 	%r56, %ctaid.x;
	mov.u32 	%r57, %ntid.x;
	mov.u32 	%r58, %tid.x;
	mad.lo.s32 	%r59, %r57, %r56, %r58;
	sub.s32 	%r60, %r9, %r82;
	rem.s32 	%r11, %r59, %r60;
	div.s32 	%r61, %r59, %r60;
	// y = %r79; threads past the last row of the rectangle have no work.
	add.s32 	%r79, %r61, %r7;
	setp.ge.s32	%p9, %r79, %r10;
	@%p9 bra 	BB12_4;

	// %rd46 = (shift_row * (2r+1) + shift_col) * param_7: element offset of this
	// shift's slice inside param_0. x = %r80. Clear the early-out flag.
	mad.lo.s32 	%r62, %r3, %r1, %r2;
	mul.lo.s32 	%r63, %r62, %r38;
	cvt.s64.s32	%rd46, %r63;
	add.s32 	%r80, %r11, %r82;
	mov.pred 	%p19, 0;

BB12_4:
	// Common join point of all early-out paths.
	@%p19 bra 	BB12_20;

	// Summation window along x, clipped to the valid rectangle:
	//   low  = %r85 = max(%r82, x - f)
	//   high = %r21 = min(%r81, x + f + 1)
	// %r22 = y * stride, %r23 = high - low (element count), %f41 = running sum.
	sub.s32 	%r64, %r80, %r41;
	max.s32 	%r85, %r82, %r64;
	add.s32 	%r65, %r41, %r80;
	add.s32 	%r66, %r65, 1;
	min.s32 	%r21, %r81, %r66;
	mul.lo.s32 	%r22, %r79, %r37;
	sub.s32 	%r23, %r21, %r85;
	mov.f32 	%f41, 0f00000000;
	// Empty window: skip the summation entirely.
	setp.le.s32	%p11, %r21, %r85;
	@%p11 bra 	BB12_15;

	// The loop is unrolled by 4. The count % 4 leading elements are summed first
	// by entering the ladder below at the matching rung:
	//   remainder 3 -> fall through, 2 -> BB12_10, 1 -> BB12_11, 0 -> BB12_12.
	and.b32  	%r24, %r23, 3;
	setp.eq.s32	%p12, %r24, 0;
	mov.f32 	%f41, 0f00000000;
	@%p12 bra 	BB12_12;

	setp.eq.s32	%p13, %r24, 1;
	mov.f32 	%f38, 0f00000000;
	@%p13 bra 	BB12_11;

	setp.eq.s32	%p14, %r24, 2;
	mov.f32 	%f37, 0f00000000;
	@%p14 bra 	BB12_10;

	// Remainder element 1 of 3: param_0[shift_offset + y*stride + x1], then x1++.
	add.s32 	%r67, %r85, %r22;
	cvt.s64.s32	%rd14, %r67;
	add.s64 	%rd15, %rd14, %rd46;
	cvta.to.global.u64 	%rd16, %rd7;
	shl.b64 	%rd17, %rd15, 2;
	add.s64 	%rd18, %rd16, %rd17;
	ld.global.nc.f32 	%f18, [%rd18];
	add.ftz.f32 	%f37, %f18, 0f00000000;
	add.s32 	%r85, %r85, 1;

BB12_10:
	// Remainder element 2 of 3 (or 1 of 2).
	add.s32 	%r68, %r85, %r22;
	cvt.s64.s32	%rd19, %r68;
	add.s64 	%rd20, %rd19, %rd46;
	cvta.to.global.u64 	%rd21, %rd7;
	shl.b64 	%rd22, %rd20, 2;
	add.s64 	%rd23, %rd21, %rd22;
	ld.global.nc.f32 	%f19, [%rd23];
	add.ftz.f32 	%f38, %f37, %f19;
	add.s32 	%r85, %r85, 1;

BB12_11:
	// Last remainder element; the partial sum moves into %f41.
	add.s32 	%r69, %r85, %r22;
	cvt.s64.s32	%rd24, %r69;
	add.s64 	%rd25, %rd24, %rd46;
	cvta.to.global.u64 	%rd26, %rd7;
	shl.b64 	%rd27, %rd25, 2;
	add.s64 	%rd28, %rd26, %rd27;
	ld.global.nc.f32 	%f20, [%rd28];
	add.ftz.f32 	%f41, %f38, %f20;
	add.s32 	%r85, %r85, 1;

BB12_12:
	// Fewer than 4 elements in total: the remainder ladder already covered them.
	setp.lt.u32	%p15, %r23, 4;
	@%p15 bra 	BB12_15;

	// %rd47 = address of param_0[shift_offset + y*stride + x1] for the main loop.
	mad.lo.s32 	%r70, %r79, %r37, %r85;
	cvt.s64.s32	%rd29, %r70;
	add.s64 	%rd30, %rd46, %rd29;
	cvta.to.global.u64 	%rd31, %rd7;
	shl.b64 	%rd32, %rd30, 2;
	add.s64 	%rd47, %rd31, %rd32;

BB12_14:
	// Main loop: add four consecutive floats per iteration, in order, then
	// advance the pointer by 16 bytes and x1 by 4 until x1 reaches high.
	ld.global.nc.f32 	%f21, [%rd47];
	add.ftz.f32 	%f22, %f41, %f21;
	ld.global.nc.f32 	%f23, [%rd47+4];
	add.ftz.f32 	%f24, %f22, %f23;
	ld.global.nc.f32 	%f25, [%rd47+8];
	add.ftz.f32 	%f26, %f24, %f25;
	ld.global.nc.f32 	%f27, [%rd47+12];
	add.ftz.f32 	%f41, %f26, %f27;
	add.s64 	%rd47, %rd47, 16;
	add.s32 	%r85, %r85, 4;
	setp.lt.s32	%p16, %r85, %r21;
	@%p16 bra 	BB12_14;

BB12_15:
	// %f10 = sum * (1 / count), using the approximate reciprocal.
	// NOTE(review): this is also reached with count <= 0 when the window is
	// empty; presumably callers pass f >= 0 so that never happens - confirm.
	cvt.rn.f32.s32	%f28, %r23;
	rcp.approx.ftz.f32 	%f29, %f28;
	mul.ftz.f32 	%f10, %f41, %f29;
	// %r33 = (y + dy)*stride + (x + dx): index of the shifted pixel.
	// %r34 = y*stride + x:               index of this pixel.
	add.s32 	%r72, %r79, %r77;
	add.s32 	%r73, %r80, %r78;
	mad.lo.s32 	%r33, %r72, %r37, %r73;
	add.s32 	%r34, %r22, %r80;
	// %rd6 = &param_3[this pixel]; %rd34 (byte offset) is reused for param_2 below.
	cvta.to.global.u64 	%rd33, %rd10;
	mul.wide.s32 	%rd34, %r34, 4;
	add.s64 	%rd6, %rd33, %rd34;
	// A null param_2 selects the store-only path at BB12_19.
	setp.eq.s64	%p17, %rd9, 0;
	@%p17 bra 	BB12_19;

	// param_2 present: accumulate the mean into param_3 atomically, then fetch
	// the value of the shifted pixel from param_1.
	atom.global.add.f32 	%f30, [%rd6], %f10;
	cvta.to.global.u64 	%rd35, %rd8;
	mul.wide.s32 	%rd36, %r33, 4;
	add.s64 	%rd37, %rd35, %rd36;
	ld.global.nc.f32 	%f42, [%rd37];
	// param_8 == 0: use the single value just loaded.
	setp.eq.s32	%p18, %r39, 0;
	@%p18 bra 	BB12_18;

	// param_8 != 0: replace the value by the average of the three values at
	// offsets 0, param_8 and 2*param_8 (0f3EAAAAAB is 1/3 rounded to float).
	add.s32 	%r74, %r33, %r39;
	mul.wide.s32 	%rd39, %r74, 4;
	add.s64 	%rd40, %rd35, %rd39;
	ld.global.nc.f32 	%f31, [%rd40];
	add.ftz.f32 	%f32, %f42, %f31;
	shl.b32 	%r75, %r39, 1;
	add.s32 	%r76, %r33, %r75;
	mul.wide.s32 	%rd41, %r76, 4;
	add.s64 	%rd42, %rd35, %rd41;
	ld.global.nc.f32 	%f33, [%rd42];
	add.ftz.f32 	%f34, %f32, %f33;
	mul.ftz.f32 	%f42, %f34, 0f3EAAAAAB;

BB12_18:
	// Atomically add mean * value into param_2[this pixel].
	cvta.to.global.u64 	%rd43, %rd9;
	add.s64 	%rd45, %rd43, %rd34;
	mul.ftz.f32 	%f35, %f10, %f42;
	atom.global.add.f32 	%f36, [%rd45], %f35;
	bra.uni 	BB12_20;

BB12_19:
	// param_2 is null: overwrite param_3[this pixel] with the mean (non-atomic).
	st.global.f32 	[%rd6], %f10;

BB12_20:
	ret;
}

	// .globl	kernel_cuda_filter_nlm_normalize
// kernel_cuda_filter_nlm_normalize
//
// One thread per element: divides param_0[i] in place by param_1[i], where
// i = y*param_4 + x, using the approximate flush-to-zero divide.
//
//   param_0  .u64  float buffer, read and written
//   param_1  .u64  float buffer holding the divisors, read through ld.global.nc
//   param_2  .u32  exclusive upper bound for x
//   param_3  .u32  exclusive upper bound for y
//   param_4  .u32  row stride in elements
//
// x = ctaid.x*ntid.x + tid.x, y = ctaid.y*ntid.y + tid.y. Threads with
// x >= param_2 or y >= param_3 return without touching memory.
.visible .entry kernel_cuda_filter_nlm_normalize(
	.param .u64 kernel_cuda_filter_nlm_normalize_param_0,
	.param .u64 kernel_cuda_filter_nlm_normalize_param_1,
	.param .u32 kernel_cuda_filter_nlm_normalize_param_2,
	.param .u32 kernel_cuda_filter_nlm_normalize_param_3,
	.param .u32 kernel_cuda_filter_nlm_normalize_param_4
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%x_ok, %xy_ok;
	.reg .f32 	%num, %den, %quot;
	.reg .b32 	%x_end, %y_end, %pitch;
	.reg .b32 	%blk_x, %dim_x, %thr_x, %x;
	.reg .b32 	%blk_y, %dim_y, %thr_y, %y;
	.reg .b32 	%elem;
	.reg .b64 	%dst_arg, %div_arg, %dst_base, %div_base;
	.reg .b64 	%byte_off, %dst_ptr, %div_ptr;

	// Scalar arguments first, then the two buffer pointers.
	ld.param.u32 	%x_end, [kernel_cuda_filter_nlm_normalize_param_2];
	ld.param.u32 	%y_end, [kernel_cuda_filter_nlm_normalize_param_3];
	ld.param.u32 	%pitch, [kernel_cuda_filter_nlm_normalize_param_4];
	ld.param.u64 	%dst_arg, [kernel_cuda_filter_nlm_normalize_param_0];
	ld.param.u64 	%div_arg, [kernel_cuda_filter_nlm_normalize_param_1];

	// Global thread coordinates.
	mov.u32 	%blk_y, %ctaid.y;
	mov.u32 	%dim_y, %ntid.y;
	mov.u32 	%thr_y, %tid.y;
	mad.lo.s32 	%y, %blk_y, %dim_y, %thr_y;
	mov.u32 	%blk_x, %ctaid.x;
	mov.u32 	%dim_x, %ntid.x;
	mov.u32 	%thr_x, %tid.x;
	mad.lo.s32 	%x, %blk_x, %dim_x, %thr_x;

	// Proceed only when x < x_end and y < y_end.
	setp.lt.s32	%x_ok, %x, %x_end;
	setp.lt.and.s32	%xy_ok, %y, %y_end, %x_ok;
	@!%xy_ok bra 	BB13_done;

	// Both buffers are addressed with the same byte offset 4*(y*pitch + x).
	mad.lo.s32 	%elem, %y, %pitch, %x;
	mul.wide.s32 	%byte_off, %elem, 4;
	cvta.to.global.u64 	%dst_base, %dst_arg;
	cvta.to.global.u64 	%div_base, %div_arg;
	add.s64 	%dst_ptr, %dst_base, %byte_off;
	add.s64 	%div_ptr, %div_base, %byte_off;

	// In-place divide: dst[i] = dst[i] / div[i].
	ld.global.f32 	%num, [%dst_ptr];
	ld.global.nc.f32 	%den, [%div_ptr];
	div.approx.ftz.f32 	%quot, %num, %den;
	st.global.f32 	[%dst_ptr], %quot;

BB13_done:
	ret;
}

	// .globl	kernel_cuda_filter_nlm_construct_gramian
.visible .entry kernel_cuda_filter_nlm_construct_gramian(
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_0,
	.param .u64 kernel_cuda_filter_nlm_construct_gramian_param_1,
	.param .u64 kernel_cuda_filter_nlm_construct_gramian_param_2,
	.param .u64 kernel_cuda_filter_nlm_construct_gramian_param_3,
	.param .u64 kernel_cuda_filter_nlm_construct_gramian_param_4,
	.param .u64 kernel_cuda_filter_nlm_construct_gramian_param_5,
	.param .u64 kernel_cuda_filter_nlm_construct_gramian_param_6,
	.param .align 16 .b8 kernel_cuda_filter_nlm_construct_gramian_param_7[16],
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_8,
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_9,
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_10,
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_11,
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_12,
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_13,
	.param .u32 kernel_cuda_filter_nlm_construct_gramian_param_14,
	.param .u8 kernel_cuda_filter_nlm_construct_gramian_param_15
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<130>;
	.reg .b16 	%rs<25>;
	.reg .f32 	%f<375>;
	.reg .b32 	%r<1115>;
	.reg .b64 	%rd<236>;
	// demoted variable
	.shared .align 4 .b8 _ZZ31kernel_filter_construct_gramianiiiiiiiiibPKfS0_PifPfP6float3iE17shared_design_row[12288];

	ld.param.u64 	%rd30, [kernel_cuda_filter_nlm_construct_gramian_param_1];
	ld.param.u64 	%rd31, [kernel_cuda_filter_nlm_construct_gramian_param_2];
	ld.param.u64 	%rd32, [kernel_cuda_filter_nlm_construct_gramian_param_3];
	ld.param.u64 	%rd27, [kernel_cuda_filter_nlm_construct_gramian_param_4];
	ld.param.u64 	%rd28, [kernel_cuda_filter_nlm_construct_gramian_param_5];
	ld.param.u64 	%rd29, [kernel_cuda_filter_nlm_construct_gramian_param_6];
	ld.param.v4.u32 	{%r299, %r300, %r301, %r302}, [kernel_cuda_filter_nlm_construct_gramian_param_7];
	ld.param.u32 	%r291, [kernel_cuda_filter_nlm_construct_gramian_param_8];
	ld.param.u32 	%r292, [kernel_cuda_filter_nlm_construct_gramian_param_9];
	ld.param.u32 	%r293, [kernel_cuda_filter_nlm_construct_gramian_param_10];
	ld.param.u32 	%r294, [kernel_cuda_filter_nlm_construct_gramian_param_11];
	ld.param.u32 	%r295, [kernel_cuda_filter_nlm_construct_gramian_param_12];
	ld.param.u32 	%r296, [kernel_cuda_filter_nlm_construct_gramian_param_13];
	ld.param.u32 	%r297, [kernel_cuda_filter_nlm_construct_gramian_param_14];
	ld.param.s8 	%rs1, [kernel_cuda_filter_nlm_construct_gramian_param_15];
	cvta.to.global.u64 	%rd1, %rd30;
	cvta.to.global.u64 	%rd2, %rd32;
	cvta.to.global.u64 	%rd3, %rd31;
	shl.b32 	%r303, %r295, 1;
	add.s32 	%r1, %r303, 1;
	mov.u32 	%r304, %ntid.y;
	mov.u32 	%r305, %ctaid.y;
	mov.u32 	%r2, %tid.y;
	mad.lo.s32 	%r306, %r304, %r305, %r2;
	rem.s32 	%r3, %r306, %r1;
	div.s32 	%r4, %r306, %r1;
	mov.pred 	%p129, -1;
	setp.ge.s32	%p3, %r4, %r1;
	@%p3 bra 	BB14_4;

	sub.s32 	%r1020, %r3, %r295;
	neg.s32 	%r308, %r1020;
	mov.u32 	%r309, 0;
	max.s32 	%r1024, %r309, %r308;
	sub.s32 	%r1019, %r4, %r295;
	neg.s32 	%r310, %r1019;
	max.s32 	%r311, %r309, %r310;
	max.s32 	%r312, %r309, %r1020;
	sub.s32 	%r1023, %r291, %r312;
	max.s32 	%r313, %r309, %r1019;
	sub.s32 	%r314, %r292, %r313;
	max.s32 	%r9, %r299, %r1024;
	max.s32 	%r10, %r300, %r311;
	min.s32 	%r11, %r301, %r1023;
	min.s32 	%r12, %r302, %r314;
	setp.le.s32	%p5, %r12, %r10;
	setp.le.s32	%p6, %r11, %r9;
	or.pred  	%p7, %p5, %p6;
	@%p7 bra 	BB14_4;

	mov.u32 	%r316, %ctaid.x;
	mov.u32 	%r317, %ntid.x;
	mov.u32 	%r318, %tid.x;
	mad.lo.s32 	%r319, %r317, %r316, %r318;
	sub.s32 	%r320, %r11, %r9;
	rem.s32 	%r321, %r319, %r320;
	add.s32 	%r1022, %r321, %r9;
	div.s32 	%r322, %r319, %r320;
	add.s32 	%r1021, %r322, %r10;
	setp.ge.s32	%p9, %r1021, %r12;
	@%p9 bra 	BB14_4;

	mad.lo.s32 	%r323, %r4, %r1, %r3;
	mul.lo.s32 	%r1025, %r323, %r294;
	mov.pred 	%p129, 0;

BB14_4:
	@%p129 bra 	BB14_161;

	sub.s32 	%r324, %r1022, %r296;
	max.s32 	%r1028, %r1024, %r324;
	add.s32 	%r325, %r296, %r1022;
	add.s32 	%r326, %r325, 1;
	min.s32 	%r24, %r1023, %r326;
	sub.s32 	%r25, %r24, %r1028;
	mov.f32 	%f374, 0f00000000;
	setp.le.s32	%p11, %r24, %r1028;
	@%p11 bra 	BB14_15;

	mul.lo.s32 	%r26, %r1021, %r293;
	and.b32  	%r27, %r25, 3;
	setp.eq.s32	%p12, %r27, 0;
	mov.f32 	%f374, 0f00000000;
	@%p12 bra 	BB14_12;

	setp.eq.s32	%p13, %r27, 1;
	mov.f32 	%f371, 0f00000000;
	@%p13 bra 	BB14_11;

	setp.eq.s32	%p14, %r27, 2;
	mov.f32 	%f370, 0f00000000;
	@%p14 bra 	BB14_10;

	add.s32 	%r327, %r1028, %r26;
	add.s32 	%r328, %r327, %r1025;
	mul.wide.s32 	%rd33, %r328, 4;
	add.s64 	%rd34, %rd1, %rd33;
	ld.global.nc.f32 	%f33, [%rd34];
	add.ftz.f32 	%f370, %f33, 0f00000000;
	add.s32 	%r1028, %r1028, 1;

BB14_10:
	add.s32 	%r329, %r1028, %r26;
	add.s32 	%r330, %r329, %r1025;
	mul.wide.s32 	%rd35, %r330, 4;
	add.s64 	%rd36, %rd1, %rd35;
	ld.global.nc.f32 	%f34, [%rd36];
	add.ftz.f32 	%f371, %f370, %f34;
	add.s32 	%r1028, %r1028, 1;

BB14_11:
	add.s32 	%r331, %r1028, %r26;
	add.s32 	%r332, %r331, %r1025;
	mul.wide.s32 	%rd37, %r332, 4;
	add.s64 	%rd38, %rd1, %rd37;
	ld.global.nc.f32 	%f35, [%rd38];
	add.ftz.f32 	%f374, %f371, %f35;
	add.s32 	%r1028, %r1028, 1;

BB14_12:
	setp.lt.u32	%p15, %r25, 4;
	@%p15 bra 	BB14_15;

	mad.lo.s32 	%r333, %r1021, %r293, %r1028;
	add.s32 	%r334, %r1025, %r333;
	mul.wide.s32 	%rd39, %r334, 4;
	add.s64 	%rd235, %rd1, %rd39;

BB14_14:
	ld.global.nc.f32 	%f36, [%rd235];
	add.ftz.f32 	%f37, %f374, %f36;
	ld.global.nc.f32 	%f38, [%rd235+4];
	add.ftz.f32 	%f39, %f37, %f38;
	ld.global.nc.f32 	%f40, [%rd235+8];
	add.ftz.f32 	%f41, %f39, %f40;
	ld.global.nc.f32 	%f42, [%rd235+12];
	add.ftz.f32 	%f374, %f41, %f42;
	add.s64 	%rd235, %rd235, 16;
	add.s32 	%r1028, %r1028, 4;
	setp.lt.s32	%p16, %r1028, %r24;
	@%p16 bra 	BB14_14;

BB14_15:
	cvta.to.global.u64 	%rd7, %rd27;
	cvt.rn.f32.s32	%f43, %r25;
	rcp.approx.ftz.f32 	%f44, %f43;
	mul.ftz.f32 	%f10, %f374, %f44;
	sub.s32 	%r336, %r301, %r299;
	sub.s32 	%r337, %r1021, %r300;
	sub.s32 	%r338, %r1022, %r299;
	mad.lo.s32 	%r39, %r337, %r336, %r338;
	sub.s32 	%r339, %r302, %r300;
	mul.lo.s32 	%r41, %r336, %r339;
	setp.lt.ftz.f32	%p17, %f10, 0f3A83126F;
	@%p17 bra 	BB14_161;

	mad.lo.s32 	%r42, %r1021, %r293, %r1022;
	add.s32 	%r340, %r1021, %r1019;
	add.s32 	%r341, %r1020, %r297;
	add.s32 	%r342, %r341, %r1022;
	mad.lo.s32 	%r343, %r340, %r293, %r342;
	shl.b32 	%r344, %r294, 3;
	add.s32 	%r345, %r343, %r344;
	mul.wide.s32 	%rd40, %r345, 4;
	add.s64 	%rd8, %rd3, %rd40;
	shl.b32 	%r43, %r294, 2;
	cvt.s64.s32	%rd41, %r43;
	add.s64 	%rd9, %rd8, %rd41;
	mul.wide.s32 	%rd42, %r343, 4;
	add.s64 	%rd10, %rd3, %rd42;
	ld.global.nc.f32 	%f11, [%rd10];
	setp.lt.ftz.f32	%p18, %f11, 0f00000000;
	@%p18 bra 	BB14_161;

	mul.wide.s32 	%rd43, %r39, 4;
	add.s64 	%rd11, %rd7, %rd43;
	mov.u32 	%r346, %tid.x;
	mov.u32 	%r347, %ntid.x;
	mad.lo.s32 	%r348, %r347, %r2, %r346;
	mul.lo.s32 	%r349, %r348, 12;
	ld.global.nc.f32 	%f12, [%rd8];
	ld.global.nc.f32 	%f13, [%rd9];
	add.s64 	%rd45, %rd9, %rd41;
	ld.global.nc.f32 	%f14, [%rd45];
	shl.b32 	%r350, %r349, 2;
	mov.u32 	%r351, _ZZ31kernel_filter_construct_gramianiiiiiiiiibPKfS0_PifPfP6float3iE17shared_design_row;
	add.s32 	%r352, %r351, %r350;
	mul.wide.s32 	%rd46, %r42, 4;
	add.s64 	%rd12, %rd3, %rd46;
	ld.global.u32 	%r1108, [%rd11];
	mov.u32 	%r353, 1065353216;
	st.shared.u32 	[%r352], %r353;
	add.s32 	%r45, %r349, 1;
	setp.lt.s32	%p19, %r1108, 1;
	@%p19 bra 	BB14_27;

	and.b32  	%r46, %r1108, 3;
	setp.eq.s32	%p20, %r46, 0;
	mov.u32 	%r1034, 0;
	@%p20 bra 	BB14_24;

	setp.eq.s32	%p21, %r46, 1;
	mov.u32 	%r355, 0;
	mov.u32 	%r1031, %r355;
	@%p21 bra 	BB14_23;

	setp.eq.s32	%p22, %r46, 2;
	mov.u32 	%r356, 0;
	mov.u32 	%r1030, %r356;
	@%p22 bra 	BB14_22;

	mov.u32 	%r366, 0;
	st.shared.u32 	[%r352+4], %r366;
	mov.u32 	%r1030, 1;

BB14_22:
	add.s32 	%r367, %r45, %r1030;
	shl.b32 	%r368, %r367, 2;
	add.s32 	%r370, %r351, %r368;
	st.shared.u32 	[%r370], %r356;
	add.s32 	%r1031, %r1030, 1;

BB14_23:
	add.s32 	%r372, %r45, %r1031;
	shl.b32 	%r373, %r372, 2;
	add.s32 	%r375, %r351, %r373;
	st.shared.u32 	[%r375], %r355;
	add.s32 	%r1034, %r1031, 1;

BB14_24:
	setp.lt.u32	%p23, %r1108, 4;
	@%p23 bra 	BB14_27;

	mul.lo.s32 	%r379, %r347, %r2;
	mad.lo.s32 	%r380, %r379, 12, %r1034;
	mad.lo.s32 	%r382, %r346, 12, %r380;
	shl.b32 	%r383, %r382, 2;
	add.s32 	%r1033, %r351, %r383;

BB14_26:
	mov.u32 	%r385, 0;
	st.shared.u32 	[%r1033+4], %r385;
	st.shared.u32 	[%r1033+16], %r385;
	mov.u64 	%rd47, 0;
	st.shared.u32 	[%r1033+12], %rd47;
	st.shared.u32 	[%r1033+8], %rd47;
	add.s32 	%r1033, %r1033, 16;
	add.s32 	%r1034, %r1034, 4;
	setp.lt.s32	%p24, %r1034, %r1108;
	@%p24 bra 	BB14_26;

BB14_27:
	cvt.rn.f32.s32	%f15, %r1020;
	@%p19 bra 	BB14_37;

	and.b32  	%r57, %r1108, 3;
	setp.eq.s32	%p26, %r57, 0;
	mov.u32 	%r1040, 0;
	@%p26 bra 	BB14_34;

	setp.eq.s32	%p27, %r57, 1;
	mov.u32 	%r1036, 0;
	@%p27 bra 	BB14_33;

	setp.eq.s32	%p28, %r57, 2;
	mov.u32 	%r1035, 0;
	@%p28 bra 	BB14_32;

	add.s64 	%rd49, %rd2, %rd43;
	ld.global.nc.f32 	%f45, [%rd49];
	ld.shared.f32 	%f46, [%r352+4];
	fma.rn.ftz.f32 	%f47, %f15, %f45, %f46;
	st.shared.f32 	[%r352+4], %f47;
	mov.u32 	%r1035, 1;

BB14_32:
	neg.s32 	%r398, %r1035;
	and.b32  	%r399, %r41, %r398;
	add.s32 	%r400, %r399, %r39;
	mul.wide.s32 	%rd50, %r400, 4;
	add.s64 	%rd51, %rd2, %rd50;
	ld.global.nc.f32 	%f48, [%rd51];
	add.s32 	%r1036, %r1035, 1;
	mad.lo.s32 	%r405, %r348, 12, %r1035;
	shl.b32 	%r406, %r405, 2;
	add.s32 	%r408, %r406, %r351;
	ld.shared.f32 	%f49, [%r408+4];
	fma.rn.ftz.f32 	%f50, %f15, %f48, %f49;
	st.shared.f32 	[%r408+4], %f50;

BB14_33:
	mad.lo.s32 	%r409, %r1036, %r41, %r39;
	mul.wide.s32 	%rd52, %r409, 4;
	add.s64 	%rd53, %rd2, %rd52;
	ld.global.nc.f32 	%f51, [%rd53];
	add.s32 	%r1040, %r1036, 1;
	mad.lo.s32 	%r414, %r348, 12, %r1036;
	shl.b32 	%r415, %r414, 2;
	add.s32 	%r417, %r415, %r351;
	ld.shared.f32 	%f52, [%r417+4];
	fma.rn.ftz.f32 	%f53, %f15, %f51, %f52;
	st.shared.f32 	[%r417+4], %f53;

BB14_34:
	setp.lt.u32	%p29, %r1108, 4;
	@%p29 bra 	BB14_37;

	mul.lo.s32 	%r420, %r347, %r2;
	mad.lo.s32 	%r421, %r420, 12, %r1040;
	mad.lo.s32 	%r423, %r346, 12, %r421;
	shl.b32 	%r424, %r423, 2;
	add.s32 	%r1039, %r351, %r424;
	shl.b32 	%r64, %r41, 2;
	mul.lo.s32 	%r429, %r1040, %r336;
	mul.lo.s32 	%r1038, %r429, %r339;

BB14_36:
	add.s32 	%r430, %r1038, %r39;
	mul.wide.s32 	%rd54, %r430, 4;
	add.s64 	%rd55, %rd2, %rd54;
	ld.global.nc.f32 	%f54, [%rd55];
	ld.shared.f32 	%f55, [%r1039+4];
	fma.rn.ftz.f32 	%f56, %f15, %f54, %f55;
	ld.shared.f32 	%f57, [%r1039+8];
	ld.shared.f32 	%f58, [%r1039+12];
	ld.shared.f32 	%f59, [%r1039+16];
	st.shared.f32 	[%r1039+4], %f56;
	cvt.s64.s32	%rd56, %r64;
	add.s64 	%rd57, %rd55, %rd56;
	ld.global.nc.f32 	%f60, [%rd57];
	fma.rn.ftz.f32 	%f61, %f15, %f60, %f57;
	st.shared.f32 	[%r1039+8], %f61;
	add.s64 	%rd58, %rd57, %rd56;
	ld.global.nc.f32 	%f62, [%rd58];
	fma.rn.ftz.f32 	%f63, %f15, %f62, %f58;
	st.shared.f32 	[%r1039+12], %f63;
	add.s64 	%rd59, %rd58, %rd56;
	add.s32 	%r70, %r1039, 16;
	ld.global.nc.f32 	%f64, [%rd59];
	fma.rn.ftz.f32 	%f65, %f15, %f64, %f59;
	st.shared.f32 	[%r1039+16], %f65;
	add.s32 	%r1038, %r1038, %r64;
	add.s32 	%r1040, %r1040, 4;
	setp.lt.s32	%p30, %r1040, %r1108;
	mov.u32 	%r1039, %r70;
	@%p30 bra 	BB14_36;

BB14_37:
	cvt.rn.f32.s32	%f16, %r1019;
	@%p19 bra 	BB14_47;

	and.b32  	%r73, %r1108, 3;
	setp.eq.s32	%p32, %r73, 0;
	mov.u32 	%r1046, 0;
	@%p32 bra 	BB14_44;

	setp.eq.s32	%p33, %r73, 1;
	mov.u32 	%r1042, 0;
	@%p33 bra 	BB14_43;

	setp.eq.s32	%p34, %r73, 2;
	mov.u32 	%r1041, 0;
	@%p34 bra 	BB14_42;

	and.b16  	%rs2, %rs1, 255;
	setp.eq.s16	%p35, %rs2, 0;
	selp.b32	%r435, 10, 11, %p35;
	mad.lo.s32 	%r436, %r435, %r41, %r39;
	mul.wide.s32 	%rd60, %r436, 4;
	add.s64 	%rd61, %rd2, %rd60;
	ld.global.nc.f32 	%f66, [%rd61];
	ld.shared.f32 	%f67, [%r352+4];
	fma.rn.ftz.f32 	%f68, %f16, %f66, %f67;
	st.shared.f32 	[%r352+4], %f68;
	mov.u32 	%r1041, 1;

BB14_42:
	and.b16  	%rs3, %rs1, 255;
	setp.eq.s16	%p36, %rs3, 0;
	selp.b32	%r445, 10, 11, %p36;
	add.s32 	%r446, %r1041, %r445;
	mad.lo.s32 	%r447, %r446, %r41, %r39;
	mul.wide.s32 	%rd62, %r447, 4;
	add.s64 	%rd63, %rd2, %rd62;
	ld.global.nc.f32 	%f69, [%rd63];
	add.s32 	%r1042, %r1041, 1;
	mad.lo.s32 	%r452, %r348, 12, %r1041;
	shl.b32 	%r453, %r452, 2;
	add.s32 	%r455, %r453, %r351;
	ld.shared.f32 	%f70, [%r455+4];
	fma.rn.ftz.f32 	%f71, %f16, %f69, %f70;
	st.shared.f32 	[%r455+4], %f71;

BB14_43:
	and.b16  	%rs4, %rs1, 255;
	setp.eq.s16	%p37, %rs4, 0;
	selp.b32	%r456, 10, 11, %p37;
	add.s32 	%r457, %r1042, %r456;
	mad.lo.s32 	%r458, %r457, %r41, %r39;
	mul.wide.s32 	%rd64, %r458, 4;
	add.s64 	%rd65, %rd2, %rd64;
	ld.global.nc.f32 	%f72, [%rd65];
	add.s32 	%r1046, %r1042, 1;
	mad.lo.s32 	%r463, %r348, 12, %r1042;
	shl.b32 	%r464, %r463, 2;
	add.s32 	%r466, %r464, %r351;
	ld.shared.f32 	%f73, [%r466+4];
	fma.rn.ftz.f32 	%f74, %f16, %f72, %f73;
	st.shared.f32 	[%r466+4], %f74;

BB14_44:
	setp.lt.u32	%p38, %r1108, 4;
	@%p38 bra 	BB14_47;

	mul.lo.s32 	%r469, %r347, %r2;
	mad.lo.s32 	%r470, %r469, 12, %r1046;
	mad.lo.s32 	%r472, %r346, 12, %r470;
	shl.b32 	%r473, %r472, 2;
	add.s32 	%r1045, %r351, %r473;
	shl.b32 	%r80, %r41, 2;
	and.b16  	%rs5, %rs1, 255;
	setp.eq.s16	%p39, %rs5, 0;
	selp.b32	%r478, 10, 11, %p39;
	add.s32 	%r479, %r478, %r1046;
	mul.lo.s32 	%r480, %r479, %r336;
	mul.lo.s32 	%r1044, %r480, %r339;

BB14_46:
	add.s32 	%r481, %r1044, %r39;
	mul.wide.s32 	%rd66, %r481, 4;
	add.s64 	%rd67, %rd2, %rd66;
	ld.global.nc.f32 	%f75, [%rd67];
	ld.shared.f32 	%f76, [%r1045+4];
	fma.rn.ftz.f32 	%f77, %f16, %f75, %f76;
	ld.shared.f32 	%f78, [%r1045+8];
	ld.shared.f32 	%f79, [%r1045+12];
	ld.shared.f32 	%f80, [%r1045+16];
	st.shared.f32 	[%r1045+4], %f77;
	cvt.s64.s32	%rd68, %r80;
	add.s64 	%rd69, %rd67, %rd68;
	ld.global.nc.f32 	%f81, [%rd69];
	fma.rn.ftz.f32 	%f82, %f16, %f81, %f78;
	st.shared.f32 	[%r1045+8], %f82;
	add.s64 	%rd70, %rd69, %rd68;
	ld.global.nc.f32 	%f83, [%rd70];
	fma.rn.ftz.f32 	%f84, %f16, %f83, %f79;
	st.shared.f32 	[%r1045+12], %f84;
	add.s64 	%rd71, %rd70, %rd68;
	add.s32 	%r86, %r1045, 16;
	ld.global.nc.f32 	%f85, [%rd71];
	fma.rn.ftz.f32 	%f86, %f16, %f85, %f80;
	st.shared.f32 	[%r1045+16], %f86;
	add.s32 	%r1044, %r1044, %r80;
	add.s32 	%r1046, %r1046, 4;
	setp.lt.s32	%p40, %r1046, %r1108;
	mov.u32 	%r1045, %r86;
	@%p40 bra 	BB14_46;

BB14_47:
	ld.global.nc.f32 	%f87, [%rd12];
	abs.ftz.f32 	%f88, %f87;
	abs.ftz.f32 	%f89, %f11;
	sub.ftz.f32 	%f17, %f89, %f88;
	@%p19 bra 	BB14_57;

	and.b16  	%rs6, %rs1, 255;
	setp.eq.s16	%p42, %rs6, 0;
	selp.b32	%r483, 10, 11, %p42;
	shl.b32 	%r89, %r483, 1;
	and.b32  	%r90, %r1108, 3;
	setp.eq.s32	%p43, %r90, 0;
	mov.u32 	%r1052, 0;
	@%p43 bra 	BB14_54;

	setp.eq.s32	%p44, %r90, 1;
	mov.u32 	%r1048, 0;
	@%p44 bra 	BB14_53;

	setp.eq.s32	%p45, %r90, 2;
	mov.u32 	%r1047, 0;
	@%p45 bra 	BB14_52;

	mad.lo.s32 	%r487, %r89, %r41, %r39;
	mul.wide.s32 	%rd72, %r487, 4;
	add.s64 	%rd73, %rd2, %rd72;
	ld.global.nc.f32 	%f90, [%rd73];
	ld.shared.f32 	%f91, [%r352+4];
	fma.rn.ftz.f32 	%f92, %f17, %f90, %f91;
	st.shared.f32 	[%r352+4], %f92;
	mov.u32 	%r1047, 1;

BB14_52:
	add.s32 	%r496, %r1047, %r89;
	mad.lo.s32 	%r497, %r496, %r41, %r39;
	mul.wide.s32 	%rd74, %r497, 4;
	add.s64 	%rd75, %rd2, %rd74;
	ld.global.nc.f32 	%f93, [%rd75];
	add.s32 	%r1048, %r1047, 1;
	mad.lo.s32 	%r502, %r348, 12, %r1047;
	shl.b32 	%r503, %r502, 2;
	add.s32 	%r505, %r503, %r351;
	ld.shared.f32 	%f94, [%r505+4];
	fma.rn.ftz.f32 	%f95, %f17, %f93, %f94;
	st.shared.f32 	[%r505+4], %f95;

BB14_53:
	add.s32 	%r506, %r1048, %r89;
	mad.lo.s32 	%r507, %r506, %r41, %r39;
	mul.wide.s32 	%rd76, %r507, 4;
	add.s64 	%rd77, %rd2, %rd76;
	ld.global.nc.f32 	%f96, [%rd77];
	add.s32 	%r1052, %r1048, 1;
	mad.lo.s32 	%r512, %r348, 12, %r1048;
	shl.b32 	%r513, %r512, 2;
	add.s32 	%r515, %r513, %r351;
	ld.shared.f32 	%f97, [%r515+4];
	fma.rn.ftz.f32 	%f98, %f17, %f96, %f97;
	st.shared.f32 	[%r515+4], %f98;

BB14_54:
	setp.lt.u32	%p46, %r1108, 4;
	@%p46 bra 	BB14_57;

	mul.lo.s32 	%r518, %r347, %r2;
	mad.lo.s32 	%r519, %r518, 12, %r1052;
	mad.lo.s32 	%r521, %r346, 12, %r519;
	shl.b32 	%r522, %r521, 2;
	add.s32 	%r1051, %r351, %r522;
	shl.b32 	%r97, %r41, 2;
	mad.lo.s32 	%r528, %r483, 2, %r1052;
	mul.lo.s32 	%r1050, %r41, %r528;

BB14_56:
	add.s32 	%r529, %r1050, %r39;
	mul.wide.s32 	%rd78, %r529, 4;
	add.s64 	%rd79, %rd2, %rd78;
	ld.global.nc.f32 	%f99, [%rd79];
	ld.shared.f32 	%f100, [%r1051+4];
	fma.rn.ftz.f32 	%f101, %f17, %f99, %f100;
	ld.shared.f32 	%f102, [%r1051+8];
	ld.shared.f32 	%f103, [%r1051+12];
	ld.shared.f32 	%f104, [%r1051+16];
	st.shared.f32 	[%r1051+4], %f101;
	cvt.s64.s32	%rd80, %r97;
	add.s64 	%rd81, %rd79, %rd80;
	ld.global.nc.f32 	%f105, [%rd81];
	fma.rn.ftz.f32 	%f106, %f17, %f105, %f102;
	st.shared.f32 	[%r1051+8], %f106;
	add.s64 	%rd82, %rd81, %rd80;
	ld.global.nc.f32 	%f107, [%rd82];
	fma.rn.ftz.f32 	%f108, %f17, %f107, %f103;
	st.shared.f32 	[%r1051+12], %f108;
	add.s64 	%rd83, %rd82, %rd80;
	add.s32 	%r103, %r1051, 16;
	ld.global.nc.f32 	%f109, [%rd83];
	fma.rn.ftz.f32 	%f110, %f17, %f109, %f104;
	st.shared.f32 	[%r1051+16], %f110;
	add.s32 	%r1050, %r1050, %r97;
	add.s32 	%r1052, %r1052, 4;
	setp.lt.s32	%p48, %r1052, %r1108;
	mov.u32 	%r1051, %r103;
	@%p48 bra 	BB14_56;

BB14_57:
	add.s64 	%rd13, %rd10, %rd41;
	add.s64 	%rd14, %rd12, %rd41;
	ld.global.nc.f32 	%f111, [%rd14];
	ld.global.nc.f32 	%f112, [%rd13];
	sub.ftz.f32 	%f18, %f112, %f111;
	@%p19 bra 	BB14_67;

	and.b16  	%rs8, %rs1, 255;
	setp.eq.s16	%p50, %rs8, 0;
	selp.b32	%r531, 10, 11, %p50;
	mul.lo.s32 	%r107, %r531, 3;
	and.b32  	%r108, %r1108, 3;
	setp.eq.s32	%p51, %r108, 0;
	mov.u32 	%r1058, 0;
	@%p51 bra 	BB14_64;

	setp.eq.s32	%p52, %r108, 1;
	mov.u32 	%r1054, 0;
	@%p52 bra 	BB14_63;

	setp.eq.s32	%p53, %r108, 2;
	mov.u32 	%r1053, 0;
	@%p53 bra 	BB14_62;

	mad.lo.s32 	%r535, %r107, %r41, %r39;
	mul.wide.s32 	%rd85, %r535, 4;
	add.s64 	%rd86, %rd2, %rd85;
	ld.global.nc.f32 	%f113, [%rd86];
	ld.shared.f32 	%f114, [%r352+4];
	fma.rn.ftz.f32 	%f115, %f18, %f113, %f114;
	st.shared.f32 	[%r352+4], %f115;
	mov.u32 	%r1053, 1;

BB14_62:
	add.s32 	%r544, %r1053, %r107;
	mad.lo.s32 	%r545, %r544, %r41, %r39;
	mul.wide.s32 	%rd87, %r545, 4;
	add.s64 	%rd88, %rd2, %rd87;
	ld.global.nc.f32 	%f116, [%rd88];
	add.s32 	%r1054, %r1053, 1;
	mad.lo.s32 	%r550, %r348, 12, %r1053;
	shl.b32 	%r551, %r550, 2;
	add.s32 	%r553, %r551, %r351;
	ld.shared.f32 	%f117, [%r553+4];
	fma.rn.ftz.f32 	%f118, %f18, %f116, %f117;
	st.shared.f32 	[%r553+4], %f118;

BB14_63:
	add.s32 	%r554, %r1054, %r107;
	mad.lo.s32 	%r555, %r554, %r41, %r39;
	mul.wide.s32 	%rd89, %r555, 4;
	add.s64 	%rd90, %rd2, %rd89;
	ld.global.nc.f32 	%f119, [%rd90];
	add.s32 	%r1058, %r1054, 1;
	mad.lo.s32 	%r560, %r348, 12, %r1054;
	shl.b32 	%r561, %r560, 2;
	add.s32 	%r563, %r561, %r351;
	ld.shared.f32 	%f120, [%r563+4];
	fma.rn.ftz.f32 	%f121, %f18, %f119, %f120;
	st.shared.f32 	[%r563+4], %f121;

BB14_64:
	setp.lt.u32	%p54, %r1108, 4;
	@%p54 bra 	BB14_67;

	mul.lo.s32 	%r566, %r347, %r2;
	mad.lo.s32 	%r567, %r566, 12, %r1058;
	mad.lo.s32 	%r569, %r346, 12, %r567;
	shl.b32 	%r570, %r569, 2;
	add.s32 	%r1057, %r351, %r570;
	shl.b32 	%r115, %r41, 2;
	mad.lo.s32 	%r576, %r531, 3, %r1058;
	mul.lo.s32 	%r1056, %r41, %r576;

BB14_66:
	add.s32 	%r577, %r1056, %r39;
	mul.wide.s32 	%rd91, %r577, 4;
	add.s64 	%rd92, %rd2, %rd91;
	ld.global.nc.f32 	%f122, [%rd92];
	ld.shared.f32 	%f123, [%r1057+4];
	fma.rn.ftz.f32 	%f124, %f18, %f122, %f123;
	ld.shared.f32 	%f125, [%r1057+8];
	ld.shared.f32 	%f126, [%r1057+12];
	ld.shared.f32 	%f127, [%r1057+16];
	st.shared.f32 	[%r1057+4], %f124;
	cvt.s64.s32	%rd93, %r115;
	add.s64 	%rd94, %rd92, %rd93;
	ld.global.nc.f32 	%f128, [%rd94];
	fma.rn.ftz.f32 	%f129, %f18, %f128, %f125;
	st.shared.f32 	[%r1057+8], %f129;
	add.s64 	%rd95, %rd94, %rd93;
	ld.global.nc.f32 	%f130, [%rd95];
	fma.rn.ftz.f32 	%f131, %f18, %f130, %f126;
	st.shared.f32 	[%r1057+12], %f131;
	add.s64 	%rd96, %rd95, %rd93;
	add.s32 	%r121, %r1057, 16;
	ld.global.nc.f32 	%f132, [%rd96];
	fma.rn.ftz.f32 	%f133, %f18, %f132, %f127;
	st.shared.f32 	[%r1057+16], %f133;
	add.s32 	%r1056, %r1056, %r115;
	add.s32 	%r1058, %r1058, 4;
	setp.lt.s32	%p56, %r1058, %r1108;
	mov.u32 	%r1057, %r121;
	@%p56 bra 	BB14_66;

BB14_67:
	add.s64 	%rd15, %rd13, %rd41;
	add.s64 	%rd16, %rd14, %rd41;
	ld.global.nc.f32 	%f134, [%rd16];
	ld.global.nc.f32 	%f135, [%rd15];
	sub.ftz.f32 	%f19, %f135, %f134;
	@%p19 bra 	BB14_77;

	and.b16  	%rs10, %rs1, 255;
	setp.eq.s16	%p58, %rs10, 0;
	selp.b32	%r579, 10, 11, %p58;
	shl.b32 	%r124, %r579, 2;
	and.b32  	%r125, %r1108, 3;
	setp.eq.s32	%p59, %r125, 0;
	mov.u32 	%r1064, 0;
	@%p59 bra 	BB14_74;

	setp.eq.s32	%p60, %r125, 1;
	mov.u32 	%r1060, 0;
	@%p60 bra 	BB14_73;

	setp.eq.s32	%p61, %r125, 2;
	mov.u32 	%r1059, 0;
	@%p61 bra 	BB14_72;

	mad.lo.s32 	%r583, %r124, %r41, %r39;
	mul.wide.s32 	%rd98, %r583, 4;
	add.s64 	%rd99, %rd2, %rd98;
	ld.global.nc.f32 	%f136, [%rd99];
	ld.shared.f32 	%f137, [%r352+4];
	fma.rn.ftz.f32 	%f138, %f19, %f136, %f137;
	st.shared.f32 	[%r352+4], %f138;
	mov.u32 	%r1059, 1;

BB14_72:
	add.s32 	%r592, %r1059, %r124;
	mad.lo.s32 	%r593, %r592, %r41, %r39;
	mul.wide.s32 	%rd100, %r593, 4;
	add.s64 	%rd101, %rd2, %rd100;
	ld.global.nc.f32 	%f139, [%rd101];
	add.s32 	%r1060, %r1059, 1;
	mad.lo.s32 	%r598, %r348, 12, %r1059;
	shl.b32 	%r599, %r598, 2;
	add.s32 	%r601, %r599, %r351;
	ld.shared.f32 	%f140, [%r601+4];
	fma.rn.ftz.f32 	%f141, %f19, %f139, %f140;
	st.shared.f32 	[%r601+4], %f141;

BB14_73:
	add.s32 	%r602, %r1060, %r124;
	mad.lo.s32 	%r603, %r602, %r41, %r39;
	mul.wide.s32 	%rd102, %r603, 4;
	add.s64 	%rd103, %rd2, %rd102;
	ld.global.nc.f32 	%f142, [%rd103];
	add.s32 	%r1064, %r1060, 1;
	mad.lo.s32 	%r608, %r348, 12, %r1060;
	shl.b32 	%r609, %r608, 2;
	add.s32 	%r611, %r609, %r351;
	ld.shared.f32 	%f143, [%r611+4];
	fma.rn.ftz.f32 	%f144, %f19, %f142, %f143;
	st.shared.f32 	[%r611+4], %f144;

BB14_74:
	setp.lt.u32	%p62, %r1108, 4;
	@%p62 bra 	BB14_77;

	mul.lo.s32 	%r614, %r347, %r2;
	mad.lo.s32 	%r615, %r614, 12, %r1064;
	mad.lo.s32 	%r617, %r346, 12, %r615;
	shl.b32 	%r618, %r617, 2;
	add.s32 	%r1063, %r351, %r618;
	shl.b32 	%r132, %r41, 2;
	mad.lo.s32 	%r624, %r579, 4, %r1064;
	mul.lo.s32 	%r1062, %r41, %r624;

BB14_76:
	add.s32 	%r625, %r1062, %r39;
	mul.wide.s32 	%rd104, %r625, 4;
	add.s64 	%rd105, %rd2, %rd104;
	ld.global.nc.f32 	%f145, [%rd105];
	ld.shared.f32 	%f146, [%r1063+4];
	fma.rn.ftz.f32 	%f147, %f19, %f145, %f146;
	ld.shared.f32 	%f148, [%r1063+8];
	ld.shared.f32 	%f149, [%r1063+12];
	ld.shared.f32 	%f150, [%r1063+16];
	st.shared.f32 	[%r1063+4], %f147;
	cvt.s64.s32	%rd106, %r132;
	add.s64 	%rd107, %rd105, %rd106;
	ld.global.nc.f32 	%f151, [%rd107];
	fma.rn.ftz.f32 	%f152, %f19, %f151, %f148;
	st.shared.f32 	[%r1063+8], %f152;
	add.s64 	%rd108, %rd107, %rd106;
	ld.global.nc.f32 	%f153, [%rd108];
	fma.rn.ftz.f32 	%f154, %f19, %f153, %f149;
	st.shared.f32 	[%r1063+12], %f154;
	add.s64 	%rd109, %rd108, %rd106;
	add.s32 	%r138, %r1063, 16;
	ld.global.nc.f32 	%f155, [%rd109];
	fma.rn.ftz.f32 	%f156, %f19, %f155, %f150;
	st.shared.f32 	[%r1063+16], %f156;
	add.s32 	%r1062, %r1062, %r132;
	add.s32 	%r1064, %r1064, 4;
	setp.lt.s32	%p64, %r1064, %r1108;
	mov.u32 	%r1063, %r138;
	@%p64 bra 	BB14_76;

BB14_77:
	add.s64 	%rd17, %rd15, %rd41;
	add.s64 	%rd18, %rd16, %rd41;
	ld.global.nc.f32 	%f157, [%rd18];
	ld.global.nc.f32 	%f158, [%rd17];
	sub.ftz.f32 	%f20, %f158, %f157;
	@%p19 bra 	BB14_87;

	and.b16  	%rs12, %rs1, 255;
	setp.eq.s16	%p66, %rs12, 0;
	selp.b32	%r627, 10, 11, %p66;
	mul.lo.s32 	%r141, %r627, 5;
	and.b32  	%r142, %r1108, 3;
	setp.eq.s32	%p67, %r142, 0;
	mov.u32 	%r1070, 0;
	@%p67 bra 	BB14_84;

	setp.eq.s32	%p68, %r142, 1;
	mov.u32 	%r1066, 0;
	@%p68 bra 	BB14_83;

	setp.eq.s32	%p69, %r142, 2;
	mov.u32 	%r1065, 0;
	@%p69 bra 	BB14_82;

	mad.lo.s32 	%r631, %r141, %r41, %r39;
	mul.wide.s32 	%rd111, %r631, 4;
	add.s64 	%rd112, %rd2, %rd111;
	ld.global.nc.f32 	%f159, [%rd112];
	ld.shared.f32 	%f160, [%r352+4];
	fma.rn.ftz.f32 	%f161, %f20, %f159, %f160;
	st.shared.f32 	[%r352+4], %f161;
	mov.u32 	%r1065, 1;

BB14_82:
	add.s32 	%r640, %r1065, %r141;
	mad.lo.s32 	%r641, %r640, %r41, %r39;
	mul.wide.s32 	%rd113, %r641, 4;
	add.s64 	%rd114, %rd2, %rd113;
	ld.global.nc.f32 	%f162, [%rd114];
	add.s32 	%r1066, %r1065, 1;
	mad.lo.s32 	%r646, %r348, 12, %r1065;
	shl.b32 	%r647, %r646, 2;
	add.s32 	%r649, %r647, %r351;
	ld.shared.f32 	%f163, [%r649+4];
	fma.rn.ftz.f32 	%f164, %f20, %f162, %f163;
	st.shared.f32 	[%r649+4], %f164;

BB14_83:
	add.s32 	%r650, %r1066, %r141;
	mad.lo.s32 	%r651, %r650, %r41, %r39;
	mul.wide.s32 	%rd115, %r651, 4;
	add.s64 	%rd116, %rd2, %rd115;
	ld.global.nc.f32 	%f165, [%rd116];
	add.s32 	%r1070, %r1066, 1;
	mad.lo.s32 	%r656, %r348, 12, %r1066;
	shl.b32 	%r657, %r656, 2;
	add.s32 	%r659, %r657, %r351;
	ld.shared.f32 	%f166, [%r659+4];
	fma.rn.ftz.f32 	%f167, %f20, %f165, %f166;
	st.shared.f32 	[%r659+4], %f167;

BB14_84:
	setp.lt.u32	%p70, %r1108, 4;
	@%p70 bra 	BB14_87;

	mul.lo.s32 	%r662, %r347, %r2;
	mad.lo.s32 	%r663, %r662, 12, %r1070;
	mad.lo.s32 	%r665, %r346, 12, %r663;
	shl.b32 	%r666, %r665, 2;
	add.s32 	%r1069, %r351, %r666;
	shl.b32 	%r149, %r41, 2;
	mad.lo.s32 	%r672, %r627, 5, %r1070;
	mul.lo.s32 	%r1068, %r41, %r672;

BB14_86:
	add.s32 	%r673, %r1068, %r39;
	mul.wide.s32 	%rd117, %r673, 4;
	add.s64 	%rd118, %rd2, %rd117;
	ld.global.nc.f32 	%f168, [%rd118];
	ld.shared.f32 	%f169, [%r1069+4];
	fma.rn.ftz.f32 	%f170, %f20, %f168, %f169;
	ld.shared.f32 	%f171, [%r1069+8];
	ld.shared.f32 	%f172, [%r1069+12];
	ld.shared.f32 	%f173, [%r1069+16];
	st.shared.f32 	[%r1069+4], %f170;
	cvt.s64.s32	%rd119, %r149;
	add.s64 	%rd120, %rd118, %rd119;
	ld.global.nc.f32 	%f174, [%rd120];
	fma.rn.ftz.f32 	%f175, %f20, %f174, %f171;
	st.shared.f32 	[%r1069+8], %f175;
	add.s64 	%rd121, %rd120, %rd119;
	ld.global.nc.f32 	%f176, [%rd121];
	fma.rn.ftz.f32 	%f177, %f20, %f176, %f172;
	st.shared.f32 	[%r1069+12], %f177;
	add.s64 	%rd122, %rd121, %rd119;
	add.s32 	%r155, %r1069, 16;
	ld.global.nc.f32 	%f178, [%rd122];
	fma.rn.ftz.f32 	%f179, %f20, %f178, %f173;
	st.shared.f32 	[%r1069+16], %f179;
	add.s32 	%r1068, %r1068, %r149;
	add.s32 	%r1070, %r1070, 4;
	setp.lt.s32	%p72, %r1070, %r1108;
	mov.u32 	%r1069, %r155;
	@%p72 bra 	BB14_86;

BB14_87:
	add.s64 	%rd19, %rd17, %rd41;
	add.s64 	%rd20, %rd18, %rd41;
	ld.global.nc.f32 	%f180, [%rd20];
	ld.global.nc.f32 	%f181, [%rd19];
	sub.ftz.f32 	%f21, %f181, %f180;
	@%p19 bra 	BB14_97;

	and.b16  	%rs14, %rs1, 255;
	setp.eq.s16	%p74, %rs14, 0;
	selp.b32	%r675, 10, 11, %p74;
	mul.lo.s32 	%r158, %r675, 6;
	and.b32  	%r159, %r1108, 3;
	setp.eq.s32	%p75, %r159, 0;
	mov.u32 	%r1076, 0;
	@%p75 bra 	BB14_94;

	setp.eq.s32	%p76, %r159, 1;
	mov.u32 	%r1072, 0;
	@%p76 bra 	BB14_93;

	setp.eq.s32	%p77, %r159, 2;
	mov.u32 	%r1071, 0;
	@%p77 bra 	BB14_92;

	mad.lo.s32 	%r679, %r158, %r41, %r39;
	mul.wide.s32 	%rd124, %r679, 4;
	add.s64 	%rd125, %rd2, %rd124;
	ld.global.nc.f32 	%f182, [%rd125];
	ld.shared.f32 	%f183, [%r352+4];
	fma.rn.ftz.f32 	%f184, %f21, %f182, %f183;
	st.shared.f32 	[%r352+4], %f184;
	mov.u32 	%r1071, 1;

BB14_92:
	add.s32 	%r688, %r1071, %r158;
	mad.lo.s32 	%r689, %r688, %r41, %r39;
	mul.wide.s32 	%rd126, %r689, 4;
	add.s64 	%rd127, %rd2, %rd126;
	ld.global.nc.f32 	%f185, [%rd127];
	add.s32 	%r1072, %r1071, 1;
	mad.lo.s32 	%r694, %r348, 12, %r1071;
	shl.b32 	%r695, %r694, 2;
	add.s32 	%r697, %r695, %r351;
	ld.shared.f32 	%f186, [%r697+4];
	fma.rn.ftz.f32 	%f187, %f21, %f185, %f186;
	st.shared.f32 	[%r697+4], %f187;

BB14_93:
	add.s32 	%r698, %r1072, %r158;
	mad.lo.s32 	%r699, %r698, %r41, %r39;
	mul.wide.s32 	%rd128, %r699, 4;
	add.s64 	%rd129, %rd2, %rd128;
	ld.global.nc.f32 	%f188, [%rd129];
	add.s32 	%r1076, %r1072, 1;
	mad.lo.s32 	%r704, %r348, 12, %r1072;
	shl.b32 	%r705, %r704, 2;
	add.s32 	%r707, %r705, %r351;
	ld.shared.f32 	%f189, [%r707+4];
	fma.rn.ftz.f32 	%f190, %f21, %f188, %f189;
	st.shared.f32 	[%r707+4], %f190;

BB14_94:
	setp.lt.u32	%p78, %r1108, 4;
	@%p78 bra 	BB14_97;

	mul.lo.s32 	%r710, %r347, %r2;
	mad.lo.s32 	%r711, %r710, 12, %r1076;
	mad.lo.s32 	%r713, %r346, 12, %r711;
	shl.b32 	%r714, %r713, 2;
	add.s32 	%r1075, %r351, %r714;
	shl.b32 	%r166, %r41, 2;
	mad.lo.s32 	%r720, %r675, 6, %r1076;
	mul.lo.s32 	%r1074, %r41, %r720;

BB14_96:
	add.s32 	%r721, %r1074, %r39;
	mul.wide.s32 	%rd130, %r721, 4;
	add.s64 	%rd131, %rd2, %rd130;
	ld.global.nc.f32 	%f191, [%rd131];
	ld.shared.f32 	%f192, [%r1075+4];
	fma.rn.ftz.f32 	%f193, %f21, %f191, %f192;
	ld.shared.f32 	%f194, [%r1075+8];
	ld.shared.f32 	%f195, [%r1075+12];
	ld.shared.f32 	%f196, [%r1075+16];
	st.shared.f32 	[%r1075+4], %f193;
	cvt.s64.s32	%rd132, %r166;
	add.s64 	%rd133, %rd131, %rd132;
	ld.global.nc.f32 	%f197, [%rd133];
	fma.rn.ftz.f32 	%f198, %f21, %f197, %f194;
	st.shared.f32 	[%r1075+8], %f198;
	add.s64 	%rd134, %rd133, %rd132;
	ld.global.nc.f32 	%f199, [%rd134];
	fma.rn.ftz.f32 	%f200, %f21, %f199, %f195;
	st.shared.f32 	[%r1075+12], %f200;
	add.s64 	%rd135, %rd134, %rd132;
	add.s32 	%r172, %r1075, 16;
	ld.global.nc.f32 	%f201, [%rd135];
	fma.rn.ftz.f32 	%f202, %f21, %f201, %f196;
	st.shared.f32 	[%r1075+16], %f202;
	add.s32 	%r1074, %r1074, %r166;
	add.s32 	%r1076, %r1076, 4;
	setp.lt.s32	%p80, %r1076, %r1108;
	mov.u32 	%r1075, %r172;
	@%p80 bra 	BB14_96;

BB14_97:
	add.s64 	%rd21, %rd19, %rd41;
	add.s64 	%rd22, %rd20, %rd41;
	ld.global.nc.f32 	%f203, [%rd22];
	ld.global.nc.f32 	%f204, [%rd21];
	sub.ftz.f32 	%f22, %f204, %f203;
	@%p19 bra 	BB14_107;

	and.b16  	%rs16, %rs1, 255;
	setp.eq.s16	%p82, %rs16, 0;
	selp.b32	%r723, 10, 11, %p82;
	mul.lo.s32 	%r175, %r723, 7;
	and.b32  	%r176, %r1108, 3;
	setp.eq.s32	%p83, %r176, 0;
	mov.u32 	%r1082, 0;
	@%p83 bra 	BB14_104;

	setp.eq.s32	%p84, %r176, 1;
	mov.u32 	%r1078, 0;
	@%p84 bra 	BB14_103;

	setp.eq.s32	%p85, %r176, 2;
	mov.u32 	%r1077, 0;
	@%p85 bra 	BB14_102;

	mad.lo.s32 	%r727, %r175, %r41, %r39;
	mul.wide.s32 	%rd137, %r727, 4;
	add.s64 	%rd138, %rd2, %rd137;
	ld.global.nc.f32 	%f205, [%rd138];
	ld.shared.f32 	%f206, [%r352+4];
	fma.rn.ftz.f32 	%f207, %f22, %f205, %f206;
	st.shared.f32 	[%r352+4], %f207;
	mov.u32 	%r1077, 1;

BB14_102:
	add.s32 	%r736, %r1077, %r175;
	mad.lo.s32 	%r737, %r736, %r41, %r39;
	mul.wide.s32 	%rd139, %r737, 4;
	add.s64 	%rd140, %rd2, %rd139;
	ld.global.nc.f32 	%f208, [%rd140];
	add.s32 	%r1078, %r1077, 1;
	mad.lo.s32 	%r742, %r348, 12, %r1077;
	shl.b32 	%r743, %r742, 2;
	add.s32 	%r745, %r743, %r351;
	ld.shared.f32 	%f209, [%r745+4];
	fma.rn.ftz.f32 	%f210, %f22, %f208, %f209;
	st.shared.f32 	[%r745+4], %f210;

BB14_103:
	add.s32 	%r746, %r1078, %r175;
	mad.lo.s32 	%r747, %r746, %r41, %r39;
	mul.wide.s32 	%rd141, %r747, 4;
	add.s64 	%rd142, %rd2, %rd141;
	ld.global.nc.f32 	%f211, [%rd142];
	add.s32 	%r1082, %r1078, 1;
	mad.lo.s32 	%r752, %r348, 12, %r1078;
	shl.b32 	%r753, %r752, 2;
	add.s32 	%r755, %r753, %r351;
	ld.shared.f32 	%f212, [%r755+4];
	fma.rn.ftz.f32 	%f213, %f22, %f211, %f212;
	st.shared.f32 	[%r755+4], %f213;

BB14_104:
	setp.lt.u32	%p86, %r1108, 4;
	@%p86 bra 	BB14_107;

	mul.lo.s32 	%r758, %r347, %r2;
	mad.lo.s32 	%r759, %r758, 12, %r1082;
	mad.lo.s32 	%r761, %r346, 12, %r759;
	shl.b32 	%r762, %r761, 2;
	add.s32 	%r1081, %r351, %r762;
	shl.b32 	%r183, %r41, 2;
	mad.lo.s32 	%r768, %r723, 7, %r1082;
	mul.lo.s32 	%r1080, %r41, %r768;

BB14_106:
	add.s32 	%r769, %r1080, %r39;
	mul.wide.s32 	%rd143, %r769, 4;
	add.s64 	%rd144, %rd2, %rd143;
	ld.global.nc.f32 	%f214, [%rd144];
	ld.shared.f32 	%f215, [%r1081+4];
	fma.rn.ftz.f32 	%f216, %f22, %f214, %f215;
	ld.shared.f32 	%f217, [%r1081+8];
	ld.shared.f32 	%f218, [%r1081+12];
	ld.shared.f32 	%f219, [%r1081+16];
	st.shared.f32 	[%r1081+4], %f216;
	cvt.s64.s32	%rd145, %r183;
	add.s64 	%rd146, %rd144, %rd145;
	ld.global.nc.f32 	%f220, [%rd146];
	fma.rn.ftz.f32 	%f221, %f22, %f220, %f217;
	st.shared.f32 	[%r1081+8], %f221;
	add.s64 	%rd147, %rd146, %rd145;
	ld.global.nc.f32 	%f222, [%rd147];
	fma.rn.ftz.f32 	%f223, %f22, %f222, %f218;
	st.shared.f32 	[%r1081+12], %f223;
	add.s64 	%rd148, %rd147, %rd145;
	add.s32 	%r189, %r1081, 16;
	ld.global.nc.f32 	%f224, [%rd148];
	fma.rn.ftz.f32 	%f225, %f22, %f224, %f219;
	st.shared.f32 	[%r1081+16], %f225;
	add.s32 	%r1080, %r1080, %r183;
	add.s32 	%r1082, %r1082, 4;
	setp.lt.s32	%p88, %r1082, %r1108;
	mov.u32 	%r1081, %r189;
	@%p88 bra 	BB14_106;

BB14_107:
	add.s64 	%rd23, %rd21, %rd41;
	add.s64 	%rd24, %rd22, %rd41;
	ld.global.nc.f32 	%f226, [%rd24];
	ld.global.nc.f32 	%f227, [%rd23];
	sub.ftz.f32 	%f23, %f227, %f226;
	@%p19 bra 	BB14_117;

	and.b16  	%rs18, %rs1, 255;
	setp.eq.s16	%p90, %rs18, 0;
	selp.b32	%r771, 10, 11, %p90;
	shl.b32 	%r192, %r771, 3;
	and.b32  	%r193, %r1108, 3;
	setp.eq.s32	%p91, %r193, 0;
	mov.u32 	%r1088, 0;
	@%p91 bra 	BB14_114;

	setp.eq.s32	%p92, %r193, 1;
	mov.u32 	%r1084, 0;
	@%p92 bra 	BB14_113;

	setp.eq.s32	%p93, %r193, 2;
	mov.u32 	%r1083, 0;
	@%p93 bra 	BB14_112;

	mad.lo.s32 	%r775, %r192, %r41, %r39;
	mul.wide.s32 	%rd150, %r775, 4;
	add.s64 	%rd151, %rd2, %rd150;
	ld.global.nc.f32 	%f228, [%rd151];
	ld.shared.f32 	%f229, [%r352+4];
	fma.rn.ftz.f32 	%f230, %f23, %f228, %f229;
	st.shared.f32 	[%r352+4], %f230;
	mov.u32 	%r1083, 1;

BB14_112:
	add.s32 	%r784, %r1083, %r192;
	mad.lo.s32 	%r785, %r784, %r41, %r39;
	mul.wide.s32 	%rd152, %r785, 4;
	add.s64 	%rd153, %rd2, %rd152;
	ld.global.nc.f32 	%f231, [%rd153];
	add.s32 	%r1084, %r1083, 1;
	mad.lo.s32 	%r790, %r348, 12, %r1083;
	shl.b32 	%r791, %r790, 2;
	add.s32 	%r793, %r791, %r351;
	ld.shared.f32 	%f232, [%r793+4];
	fma.rn.ftz.f32 	%f233, %f23, %f231, %f232;
	st.shared.f32 	[%r793+4], %f233;

BB14_113:
	add.s32 	%r794, %r1084, %r192;
	mad.lo.s32 	%r795, %r794, %r41, %r39;
	mul.wide.s32 	%rd154, %r795, 4;
	add.s64 	%rd155, %rd2, %rd154;
	ld.global.nc.f32 	%f234, [%rd155];
	add.s32 	%r1088, %r1084, 1;
	mad.lo.s32 	%r800, %r348, 12, %r1084;
	shl.b32 	%r801, %r800, 2;
	add.s32 	%r803, %r801, %r351;
	ld.shared.f32 	%f235, [%r803+4];
	fma.rn.ftz.f32 	%f236, %f23, %f234, %f235;
	st.shared.f32 	[%r803+4], %f236;

BB14_114:
	setp.lt.u32	%p94, %r1108, 4;
	@%p94 bra 	BB14_117;

	mul.lo.s32 	%r806, %r347, %r2;
	mad.lo.s32 	%r807, %r806, 12, %r1088;
	mad.lo.s32 	%r809, %r346, 12, %r807;
	shl.b32 	%r810, %r809, 2;
	add.s32 	%r1087, %r351, %r810;
	shl.b32 	%r200, %r41, 2;
	mad.lo.s32 	%r816, %r771, 8, %r1088;
	mul.lo.s32 	%r1086, %r41, %r816;

BB14_116:
	add.s32 	%r817, %r1086, %r39;
	mul.wide.s32 	%rd156, %r817, 4;
	add.s64 	%rd157, %rd2, %rd156;
	ld.global.nc.f32 	%f237, [%rd157];
	ld.shared.f32 	%f238, [%r1087+4];
	fma.rn.ftz.f32 	%f239, %f23, %f237, %f238;
	ld.shared.f32 	%f240, [%r1087+8];
	ld.shared.f32 	%f241, [%r1087+12];
	ld.shared.f32 	%f242, [%r1087+16];
	st.shared.f32 	[%r1087+4], %f239;
	cvt.s64.s32	%rd158, %r200;
	add.s64 	%rd159, %rd157, %rd158;
	ld.global.nc.f32 	%f243, [%rd159];
	fma.rn.ftz.f32 	%f244, %f23, %f243, %f240;
	st.shared.f32 	[%r1087+8], %f244;
	add.s64 	%rd160, %rd159, %rd158;
	ld.global.nc.f32 	%f245, [%rd160];
	fma.rn.ftz.f32 	%f246, %f23, %f245, %f241;
	st.shared.f32 	[%r1087+12], %f246;
	add.s64 	%rd161, %rd160, %rd158;
	add.s32 	%r206, %r1087, 16;
	ld.global.nc.f32 	%f247, [%rd161];
	fma.rn.ftz.f32 	%f248, %f23, %f247, %f242;
	st.shared.f32 	[%r1087+16], %f248;
	add.s32 	%r1086, %r1086, %r200;
	add.s32 	%r1088, %r1088, 4;
	setp.lt.s32	%p96, %r1088, %r1108;
	mov.u32 	%r1087, %r206;
	@%p96 bra 	BB14_116;

BB14_117:
	add.s64 	%rd163, %rd23, %rd41;
	add.s64 	%rd164, %rd24, %rd41;
	ld.global.nc.f32 	%f249, [%rd164];
	ld.global.nc.f32 	%f250, [%rd163];
	sub.ftz.f32 	%f24, %f250, %f249;
	@%p19 bra 	BB14_127;

	and.b16  	%rs20, %rs1, 255;
	setp.eq.s16	%p98, %rs20, 0;
	selp.b32	%r819, 10, 11, %p98;
	mul.lo.s32 	%r209, %r819, 9;
	and.b32  	%r210, %r1108, 3;
	setp.eq.s32	%p99, %r210, 0;
	mov.u32 	%r1094, 0;
	@%p99 bra 	BB14_124;

	setp.eq.s32	%p100, %r210, 1;
	mov.u32 	%r1090, 0;
	@%p100 bra 	BB14_123;

	setp.eq.s32	%p101, %r210, 2;
	mov.u32 	%r1089, 0;
	@%p101 bra 	BB14_122;

	mad.lo.s32 	%r823, %r209, %r41, %r39;
	mul.wide.s32 	%rd165, %r823, 4;
	add.s64 	%rd166, %rd2, %rd165;
	ld.global.nc.f32 	%f251, [%rd166];
	ld.shared.f32 	%f252, [%r352+4];
	fma.rn.ftz.f32 	%f253, %f24, %f251, %f252;
	st.shared.f32 	[%r352+4], %f253;
	mov.u32 	%r1089, 1;

BB14_122:
	add.s32 	%r832, %r1089, %r209;
	mad.lo.s32 	%r833, %r832, %r41, %r39;
	mul.wide.s32 	%rd167, %r833, 4;
	add.s64 	%rd168, %rd2, %rd167;
	ld.global.nc.f32 	%f254, [%rd168];
	add.s32 	%r1090, %r1089, 1;
	mad.lo.s32 	%r838, %r348, 12, %r1089;
	shl.b32 	%r839, %r838, 2;
	add.s32 	%r841, %r839, %r351;
	ld.shared.f32 	%f255, [%r841+4];
	fma.rn.ftz.f32 	%f256, %f24, %f254, %f255;
	st.shared.f32 	[%r841+4], %f256;

BB14_123:
	add.s32 	%r842, %r1090, %r209;
	mad.lo.s32 	%r843, %r842, %r41, %r39;
	mul.wide.s32 	%rd169, %r843, 4;
	add.s64 	%rd170, %rd2, %rd169;
	ld.global.nc.f32 	%f257, [%rd170];
	add.s32 	%r1094, %r1090, 1;
	mad.lo.s32 	%r848, %r348, 12, %r1090;
	shl.b32 	%r849, %r848, 2;
	add.s32 	%r851, %r849, %r351;
	ld.shared.f32 	%f258, [%r851+4];
	fma.rn.ftz.f32 	%f259, %f24, %f257, %f258;
	st.shared.f32 	[%r851+4], %f259;

BB14_124:
	setp.lt.u32	%p102, %r1108, 4;
	@%p102 bra 	BB14_127;

	mul.lo.s32 	%r854, %r347, %r2;
	mad.lo.s32 	%r855, %r854, 12, %r1094;
	mad.lo.s32 	%r857, %r346, 12, %r855;
	shl.b32 	%r858, %r857, 2;
	add.s32 	%r1093, %r351, %r858;
	shl.b32 	%r217, %r41, 2;
	mad.lo.s32 	%r864, %r819, 9, %r1094;
	mul.lo.s32 	%r1092, %r41, %r864;

BB14_126:
	add.s32 	%r865, %r1092, %r39;
	mul.wide.s32 	%rd171, %r865, 4;
	add.s64 	%rd172, %rd2, %rd171;
	ld.global.nc.f32 	%f260, [%rd172];
	ld.shared.f32 	%f261, [%r1093+4];
	fma.rn.ftz.f32 	%f262, %f24, %f260, %f261;
	ld.shared.f32 	%f263, [%r1093+8];
	ld.shared.f32 	%f264, [%r1093+12];
	ld.shared.f32 	%f265, [%r1093+16];
	st.shared.f32 	[%r1093+4], %f262;
	cvt.s64.s32	%rd173, %r217;
	add.s64 	%rd174, %rd172, %rd173;
	ld.global.nc.f32 	%f266, [%rd174];
	fma.rn.ftz.f32 	%f267, %f24, %f266, %f263;
	st.shared.f32 	[%r1093+8], %f267;
	add.s64 	%rd175, %rd174, %rd173;
	ld.global.nc.f32 	%f268, [%rd175];
	fma.rn.ftz.f32 	%f269, %f24, %f268, %f264;
	st.shared.f32 	[%r1093+12], %f269;
	add.s64 	%rd176, %rd175, %rd173;
	add.s32 	%r223, %r1093, 16;
	ld.global.nc.f32 	%f270, [%rd176];
	fma.rn.ftz.f32 	%f271, %f24, %f270, %f265;
	st.shared.f32 	[%r1093+16], %f271;
	add.s32 	%r1092, %r1092, %r217;
	add.s32 	%r1094, %r1094, 4;
	setp.lt.s32	%p104, %r1094, %r1108;
	mov.u32 	%r1093, %r223;
	@%p104 bra 	BB14_126;

BB14_127:
	and.b16  	%rs22, %rs1, 255;
	setp.eq.s16	%p106, %rs22, 0;
	or.pred  	%p107, %p19, %p106;
	@%p107 bra 	BB14_137;

	selp.b32	%r867, 10, 11, %p106;
	mul.lo.s32 	%r226, %r867, 10;
	and.b32  	%r227, %r1108, 3;
	setp.eq.s32	%p109, %r227, 0;
	mov.u32 	%r1100, 0;
	@%p109 bra 	BB14_134;

	setp.eq.s32	%p110, %r227, 1;
	mov.u32 	%r1096, 0;
	@%p110 bra 	BB14_133;

	setp.eq.s32	%p111, %r227, 2;
	mov.u32 	%r1095, 0;
	@%p111 bra 	BB14_132;

	mad.lo.s32 	%r871, %r226, %r41, %r39;
	mul.wide.s32 	%rd177, %r871, 4;
	add.s64 	%rd178, %rd2, %rd177;
	ld.global.nc.f32 	%f272, [%rd178];
	ld.shared.f32 	%f273, [%r352+4];
	fma.rn.ftz.f32 	%f274, %f272, 0f00000000, %f273;
	st.shared.f32 	[%r352+4], %f274;
	mov.u32 	%r1095, 1;

BB14_132:
	add.s32 	%r880, %r1095, %r226;
	mad.lo.s32 	%r881, %r880, %r41, %r39;
	mul.wide.s32 	%rd179, %r881, 4;
	add.s64 	%rd180, %rd2, %rd179;
	ld.global.nc.f32 	%f275, [%rd180];
	add.s32 	%r1096, %r1095, 1;
	mad.lo.s32 	%r886, %r348, 12, %r1095;
	shl.b32 	%r887, %r886, 2;
	add.s32 	%r889, %r887, %r351;
	ld.shared.f32 	%f276, [%r889+4];
	fma.rn.ftz.f32 	%f277, %f275, 0f00000000, %f276;
	st.shared.f32 	[%r889+4], %f277;

BB14_133:
	add.s32 	%r890, %r1096, %r226;
	mad.lo.s32 	%r891, %r890, %r41, %r39;
	mul.wide.s32 	%rd181, %r891, 4;
	add.s64 	%rd182, %rd2, %rd181;
	ld.global.nc.f32 	%f278, [%rd182];
	add.s32 	%r1100, %r1096, 1;
	mad.lo.s32 	%r896, %r348, 12, %r1096;
	shl.b32 	%r897, %r896, 2;
	add.s32 	%r899, %r897, %r351;
	ld.shared.f32 	%f279, [%r899+4];
	fma.rn.ftz.f32 	%f280, %f278, 0f00000000, %f279;
	st.shared.f32 	[%r899+4], %f280;

BB14_134:
	setp.lt.u32	%p112, %r1108, 4;
	@%p112 bra 	BB14_137;

	mul.lo.s32 	%r902, %r347, %r2;
	mad.lo.s32 	%r903, %r902, 12, %r1100;
	mad.lo.s32 	%r905, %r346, 12, %r903;
	shl.b32 	%r906, %r905, 2;
	add.s32 	%r1099, %r351, %r906;
	shl.b32 	%r234, %r41, 2;
	mad.lo.s32 	%r912, %r867, 10, %r1100;
	mul.lo.s32 	%r1098, %r41, %r912;

BB14_136:
	add.s32 	%r913, %r1098, %r39;
	mul.wide.s32 	%rd183, %r913, 4;
	add.s64 	%rd184, %rd2, %rd183;
	ld.global.nc.f32 	%f281, [%rd184];
	ld.shared.f32 	%f282, [%r1099+4];
	fma.rn.ftz.f32 	%f283, %f281, 0f00000000, %f282;
	ld.shared.f32 	%f284, [%r1099+8];
	ld.shared.f32 	%f285, [%r1099+12];
	ld.shared.f32 	%f286, [%r1099+16];
	st.shared.f32 	[%r1099+4], %f283;
	cvt.s64.s32	%rd185, %r234;
	add.s64 	%rd186, %rd184, %rd185;
	ld.global.nc.f32 	%f287, [%rd186];
	fma.rn.ftz.f32 	%f288, %f287, 0f00000000, %f284;
	st.shared.f32 	[%r1099+8], %f288;
	add.s64 	%rd187, %rd186, %rd185;
	ld.global.nc.f32 	%f289, [%rd187];
	fma.rn.ftz.f32 	%f290, %f289, 0f00000000, %f285;
	st.shared.f32 	[%r1099+12], %f290;
	add.s64 	%rd188, %rd187, %rd185;
	add.s32 	%r240, %r1099, 16;
	ld.global.nc.f32 	%f291, [%rd188];
	fma.rn.ftz.f32 	%f292, %f291, 0f00000000, %f286;
	st.shared.f32 	[%r1099+16], %f292;
	add.s32 	%r1098, %r1098, %r234;
	add.s32 	%r1100, %r1100, 4;
	setp.lt.s32	%p114, %r1100, %r1108;
	mov.u32 	%r1099, %r240;
	@%p114 bra 	BB14_136;

BB14_137:
	setp.lt.s32	%p115, %r1108, 0;
	@%p115 bra 	BB14_151;

	mul.lo.s32 	%r917, %r347, %r2;
	mul.lo.s32 	%r919, %r346, 12;
	mad.lo.s32 	%r920, %r917, 12, %r919;
	shl.b32 	%r921, %r920, 2;
	add.s32 	%r243, %r351, %r921;
	shl.b32 	%r244, %r41, 2;
	mov.u32 	%r1101, 0;

BB14_139:
	setp.lt.s32	%p116, %r1101, 0;
	@%p116 bra 	BB14_149;

	add.s32 	%r926, %r1101, 1;
	mul.lo.s32 	%r927, %r926, %r1101;
	shr.u32 	%r928, %r927, 31;
	add.s32 	%r929, %r927, %r928;
	shr.s32 	%r248, %r929, 1;
	mad.lo.s32 	%r934, %r348, 12, %r1101;
	shl.b32 	%r935, %r934, 2;
	add.s32 	%r937, %r351, %r935;
	ld.shared.f32 	%f25, [%r937];
	and.b32  	%r249, %r926, 3;
	setp.eq.s32	%p117, %r249, 0;
	mov.u32 	%r1104, 0;
	@%p117 bra 	BB14_146;

	setp.eq.s32	%p118, %r249, 1;
	mov.u32 	%r1103, 0;
	@%p118 bra 	BB14_145;

	setp.eq.s32	%p119, %r249, 2;
	mov.u32 	%r1102, 0;
	@%p119 bra 	BB14_144;

	mad.lo.s32 	%r941, %r248, %r41, %r39;
	cvta.to.global.u64 	%rd189, %rd28;
	mul.wide.s32 	%rd190, %r941, 4;
	add.s64 	%rd191, %rd189, %rd190;
	ld.shared.f32 	%f293, [%r352];
	mul.ftz.f32 	%f294, %f25, %f293;
	mul.ftz.f32 	%f295, %f10, %f294;
	atom.global.add.f32 	%f296, [%rd191], %f295;
	mov.u32 	%r1102, 1;

BB14_144:
	add.s32 	%r950, %r248, %r1102;
	mad.lo.s32 	%r951, %r950, %r41, %r39;
	cvta.to.global.u64 	%rd192, %rd28;
	mul.wide.s32 	%rd193, %r951, 4;
	add.s64 	%rd194, %rd192, %rd193;
	mad.lo.s32 	%r956, %r348, 12, %r1102;
	shl.b32 	%r957, %r956, 2;
	add.s32 	%r959, %r351, %r957;
	ld.shared.f32 	%f297, [%r959];
	mul.ftz.f32 	%f298, %f25, %f297;
	mul.ftz.f32 	%f299, %f10, %f298;
	atom.global.add.f32 	%f300, [%rd194], %f299;
	add.s32 	%r1103, %r1102, 1;

BB14_145:
	add.s32 	%r960, %r248, %r1103;
	mad.lo.s32 	%r961, %r960, %r41, %r39;
	cvta.to.global.u64 	%rd195, %rd28;
	mul.wide.s32 	%rd196, %r961, 4;
	add.s64 	%rd197, %rd195, %rd196;
	mad.lo.s32 	%r966, %r348, 12, %r1103;
	shl.b32 	%r967, %r966, 2;
	add.s32 	%r969, %r351, %r967;
	ld.shared.f32 	%f301, [%r969];
	mul.ftz.f32 	%f302, %f25, %f301;
	mul.ftz.f32 	%f303, %f10, %f302;
	atom.global.add.f32 	%f304, [%rd197], %f303;
	add.s32 	%r1104, %r1103, 1;

BB14_146:
	setp.lt.u32	%p120, %r926, 4;
	@%p120 bra 	BB14_149;

	add.s32 	%r1107, %r1104, -1;
	shl.b32 	%r971, %r1104, 2;
	add.s32 	%r1106, %r243, %r971;
	add.s32 	%r972, %r1104, %r248;
	mul.lo.s32 	%r1105, %r41, %r972;
	cvta.to.global.u64 	%rd25, %rd28;

BB14_148:
	add.s32 	%r973, %r1105, %r39;
	mul.wide.s32 	%rd198, %r973, 4;
	add.s64 	%rd199, %rd25, %rd198;
	ld.shared.f32 	%f305, [%r1106];
	mul.ftz.f32 	%f306, %f25, %f305;
	mul.ftz.f32 	%f307, %f10, %f306;
	atom.global.add.f32 	%f308, [%rd199], %f307;
	ld.shared.f32 	%f309, [%r1106+4];
	mul.ftz.f32 	%f310, %f25, %f309;
	mul.ftz.f32 	%f311, %f10, %f310;
	cvt.s64.s32	%rd200, %r244;
	add.s64 	%rd201, %rd199, %rd200;
	atom.global.add.f32 	%f312, [%rd201], %f311;
	ld.shared.f32 	%f313, [%r1106+8];
	mul.ftz.f32 	%f314, %f25, %f313;
	mul.ftz.f32 	%f315, %f10, %f314;
	add.s64 	%rd202, %rd201, %rd200;
	atom.global.add.f32 	%f316, [%rd202], %f315;
	ld.shared.f32 	%f317, [%r1106+12];
	mul.ftz.f32 	%f318, %f25, %f317;
	mul.ftz.f32 	%f319, %f10, %f318;
	add.s64 	%rd203, %rd202, %rd200;
	atom.global.add.f32 	%f320, [%rd203], %f319;
	add.s32 	%r1106, %r1106, 16;
	add.s32 	%r1105, %r1105, %r244;
	add.s32 	%r1107, %r1107, 4;
	setp.lt.s32	%p121, %r1107, %r1101;
	@%p121 bra 	BB14_148;

BB14_149:
	setp.lt.s32	%p122, %r1101, %r1108;
	add.s32 	%r1101, %r1101, 1;
	@%p122 bra 	BB14_139;

	ld.global.u32 	%r1108, [%rd11];

BB14_151:
	mul.ftz.f32 	%f26, %f10, %f12;
	mul.ftz.f32 	%f27, %f10, %f13;
	mul.ftz.f32 	%f28, %f10, %f14;
	setp.lt.s32	%p123, %r1108, 0;
	@%p123 bra 	BB14_161;

	add.s32 	%r267, %r1108, 1;
	and.b32  	%r268, %r267, 3;
	setp.eq.s32	%p124, %r268, 0;
	mov.u32 	%r1111, 0;
	@%p124 bra 	BB14_158;

	setp.eq.s32	%p125, %r268, 1;
	mov.u32 	%r1110, 0;
	@%p125 bra 	BB14_157;

	setp.eq.s32	%p126, %r268, 2;
	mov.u32 	%r1109, 0;
	@%p126 bra 	BB14_156;

	cvta.to.global.u64 	%rd204, %rd29;
	mul.wide.s32 	%rd205, %r39, 12;
	add.s64 	%rd206, %rd204, %rd205;
	ld.shared.f32 	%f321, [%r352];
	mul.ftz.f32 	%f322, %f26, %f321;
	atom.global.add.f32 	%f323, [%rd206], %f322;
	mul.ftz.f32 	%f324, %f27, %f321;
	add.s64 	%rd207, %rd206, 4;
	atom.global.add.f32 	%f325, [%rd207], %f324;
	mul.ftz.f32 	%f326, %f28, %f321;
	add.s64 	%rd208, %rd206, 8;
	atom.global.add.f32 	%f327, [%rd208], %f326;
	mov.u32 	%r1109, 1;

BB14_156:
	neg.s32 	%r986, %r1109;
	and.b32  	%r987, %r41, %r986;
	add.s32 	%r988, %r987, %r39;
	cvta.to.global.u64 	%rd209, %rd29;
	mul.wide.s32 	%rd210, %r988, 12;
	add.s64 	%rd211, %rd209, %rd210;
	mad.lo.s32 	%r993, %r348, 12, %r1109;
	shl.b32 	%r994, %r993, 2;
	add.s32 	%r996, %r351, %r994;
	ld.shared.f32 	%f328, [%r996];
	mul.ftz.f32 	%f329, %f26, %f328;
	atom.global.add.f32 	%f330, [%rd211], %f329;
	mul.ftz.f32 	%f331, %f27, %f328;
	add.s64 	%rd212, %rd211, 4;
	atom.global.add.f32 	%f332, [%rd212], %f331;
	mul.ftz.f32 	%f333, %f28, %f328;
	add.s64 	%rd213, %rd211, 8;
	atom.global.add.f32 	%f334, [%rd213], %f333;
	add.s32 	%r1110, %r1109, 1;

BB14_157:
	mad.lo.s32 	%r997, %r1110, %r41, %r39;
	cvta.to.global.u64 	%rd214, %rd29;
	mul.wide.s32 	%rd215, %r997, 12;
	add.s64 	%rd216, %rd214, %rd215;
	mad.lo.s32 	%r1002, %r348, 12, %r1110;
	shl.b32 	%r1003, %r1002, 2;
	add.s32 	%r1005, %r351, %r1003;
	ld.shared.f32 	%f335, [%r1005];
	mul.ftz.f32 	%f336, %f26, %f335;
	atom.global.add.f32 	%f337, [%rd216], %f336;
	mul.ftz.f32 	%f338, %f27, %f335;
	add.s64 	%rd217, %rd216, 4;
	atom.global.add.f32 	%f339, [%rd217], %f338;
	mul.ftz.f32 	%f340, %f28, %f335;
	add.s64 	%rd218, %rd216, 8;
	atom.global.add.f32 	%f341, [%rd218], %f340;
	add.s32 	%r1111, %r1110, 1;

BB14_158:
	setp.lt.u32	%p127, %r267, 4;
	@%p127 bra 	BB14_161;

	add.s32 	%r1114, %r1111, -1;
	mul.lo.s32 	%r1008, %r347, %r2;
	mad.lo.s32 	%r1009, %r1008, 12, %r1111;
	mad.lo.s32 	%r1011, %r346, 12, %r1009;
	shl.b32 	%r1012, %r1011, 2;
	add.s32 	%r1113, %r351, %r1012;
	shl.b32 	%r276, %r41, 2;
	mul.lo.s32 	%r1017, %r1111, %r336;
	mul.lo.s32 	%r1112, %r1017, %r339;
	mul.lo.s32 	%r278, %r41, 12;
	add.s32 	%r279, %r278, 4;
	add.s32 	%r280, %r278, 8;
	cvta.to.global.u64 	%rd26, %rd29;

BB14_160:
	add.s32 	%r1018, %r1112, %r39;
	mul.wide.s32 	%rd219, %r1018, 12;
	add.s64 	%rd220, %rd26, %rd219;
	ld.shared.f32 	%f342, [%r1113];
	mul.ftz.f32 	%f343, %f26, %f342;
	atom.global.add.f32 	%f344, [%rd220], %f343;
	mul.ftz.f32 	%f345, %f27, %f342;
	add.s64 	%rd221, %rd220, 4;
	atom.global.add.f32 	%f346, [%rd221], %f345;
	mul.ftz.f32 	%f347, %f28, %f342;
	add.s64 	%rd222, %rd220, 8;
	atom.global.add.f32 	%f348, [%rd222], %f347;
	ld.shared.f32 	%f349, [%r1113+4];
	mul.ftz.f32 	%f350, %f26, %f349;
	cvt.s64.s32	%rd223, %r278;
	add.s64 	%rd224, %rd220, %rd223;
	atom.global.add.f32 	%f351, [%rd224], %f350;
	mul.ftz.f32 	%f352, %f27, %f349;
	add.s64 	%rd225, %rd221, %rd223;
	atom.global.add.f32 	%f353, [%rd225], %f352;
	mul.ftz.f32 	%f354, %f28, %f349;
	add.s64 	%rd226, %rd222, %rd223;
	atom.global.add.f32 	%f355, [%rd226], %f354;
	ld.shared.f32 	%f356, [%r1113+8];
	mul.ftz.f32 	%f357, %f26, %f356;
	add.s64 	%rd227, %rd224, %rd223;
	atom.global.add.f32 	%f358, [%rd227], %f357;
	mul.ftz.f32 	%f359, %f27, %f356;
	cvt.s64.s32	%rd228, %r279;
	add.s64 	%rd229, %rd224, %rd228;
	atom.global.add.f32 	%f360, [%rd229], %f359;
	mul.ftz.f32 	%f361, %f28, %f356;
	cvt.s64.s32	%rd230, %r280;
	add.s64 	%rd231, %rd224, %rd230;
	atom.global.add.f32 	%f362, [%rd231], %f361;
	ld.shared.f32 	%f363, [%r1113+12];
	mul.ftz.f32 	%f364, %f26, %f363;
	add.s64 	%rd232, %rd227, %rd223;
	atom.global.add.f32 	%f365, [%rd232], %f364;
	mul.ftz.f32 	%f366, %f27, %f363;
	add.s64 	%rd233, %rd227, %rd228;
	atom.global.add.f32 	%f367, [%rd233], %f366;
	mul.ftz.f32 	%f368, %f28, %f363;
	add.s64 	%rd234, %rd227, %rd230;
	atom.global.add.f32 	%f369, [%rd234], %f368;
	add.s32 	%r1113, %r1113, 16;
	add.s32 	%r1112, %r1112, %r276;
	add.s32 	%r1114, %r1114, 4;
	setp.lt.s32	%p128, %r1114, %r1108;
	@%p128 bra 	BB14_160;

BB14_161:
	ret;
}

	// .globl	kernel_cuda_filter_finalize
.visible .entry kernel_cuda_filter_finalize(
	.param .u64 kernel_cuda_filter_finalize_param_0,
	.param .u64 kernel_cuda_filter_finalize_param_1,
	.param .u64 kernel_cuda_filter_finalize_param_2,
	.param .u64 kernel_cuda_filter_finalize_param_3,
	.param .align 16 .b8 kernel_cuda_filter_finalize_param_4[16],
	.param .align 16 .b8 kernel_cuda_filter_finalize_param_5[16],
	.param .u32 kernel_cuda_filter_finalize_param_6
)
.maxntid 256, 1, 1
.minnctapersm 4
{
	.reg .pred 	%p<74>;
	.reg .f32 	%f<347>;
	.reg .b32 	%r<330>;
	.reg .b64 	%rd<144>;


	ld.param.u64 	%rd12, [kernel_cuda_filter_finalize_param_0];
	ld.param.u64 	%rd13, [kernel_cuda_filter_finalize_param_1];
	ld.param.u64 	%rd14, [kernel_cuda_filter_finalize_param_2];
	ld.param.u64 	%rd15, [kernel_cuda_filter_finalize_param_3];
	ld.param.u32 	%r4, [kernel_cuda_filter_finalize_param_4+12];
	ld.param.u32 	%r3, [kernel_cuda_filter_finalize_param_4+8];
	ld.param.v4.u32 	{%r90, %r91, %r92, %r93}, [kernel_cuda_filter_finalize_param_5];
	ld.param.u32 	%r89, [kernel_cuda_filter_finalize_param_6];
	mov.u32 	%r94, %ctaid.x;
	mov.u32 	%r95, %ntid.x;
	mov.u32 	%r96, %tid.x;
	mad.lo.s32 	%r1, %r94, %r95, %r96;
	mov.u32 	%r97, %ctaid.y;
	mov.u32 	%r98, %ntid.y;
	mov.u32 	%r99, %tid.y;
	mad.lo.s32 	%r2, %r97, %r98, %r99;
	setp.ge.s32	%p3, %r1, %r3;
	setp.ge.s32	%p4, %r2, %r4;
	or.pred  	%p5, %p3, %p4;
	@%p5 bra 	BB15_78;

	cvta.to.global.u64 	%rd1, %rd15;
	cvta.to.global.u64 	%rd16, %rd14;
	cvta.to.global.u64 	%rd2, %rd13;
	mad.lo.s32 	%r5, %r2, %r3, %r1;
	mul.wide.s32 	%rd17, %r5, 4;
	add.s64 	%rd18, %rd16, %rd17;
	mul.lo.s32 	%r6, %r3, %r4;
	ld.global.f32 	%f1, [%rd18];
	setp.lt.ftz.f32	%p6, %f1, 0f3A83126F;
	@%p6 bra 	BB15_78;

	mul.wide.s32 	%rd19, %r5, 12;
	add.s64 	%rd3, %rd1, %rd19;
	ld.global.f32 	%f2, [%rd3];
	ld.global.f32 	%f3, [%rd3+4];
	ld.global.f32 	%f4, [%rd3+8];
	mul.ftz.f32 	%f5, %f1, 0f34A10FB0;
	add.s64 	%rd21, %rd2, %rd17;
	ld.global.u32 	%r7, [%rd21];
	setp.lt.s32	%p7, %r7, 0;
	@%p7 bra 	BB15_11;

	add.s32 	%r101, %r7, 1;
	mov.u32 	%r102, 1;
	max.s32 	%r8, %r101, %r102;
	and.b32  	%r103, %r8, 3;
	setp.eq.s32	%p8, %r103, 0;
	mov.u32 	%r307, 0;
	@%p8 bra 	BB15_9;

	setp.eq.s32	%p9, %r103, 1;
	mov.u32 	%r305, 0;
	@%p9 bra 	BB15_8;

	setp.eq.s32	%p10, %r103, 2;
	mov.u32 	%r304, 0;
	@%p10 bra 	BB15_7;

	add.ftz.f32 	%f104, %f5, %f1;
	st.global.f32 	[%rd18], %f104;
	mov.u32 	%r304, %r102;

BB15_7:
	neg.s32 	%r109, %r304;
	add.s32 	%r305, %r304, 1;
	and.b32  	%r110, %r305, %r109;
	shr.u32 	%r111, %r110, 1;
	add.s32 	%r112, %r111, %r304;
	mad.lo.s32 	%r113, %r112, %r6, %r5;
	mul.wide.s32 	%rd26, %r113, 4;
	add.s64 	%rd27, %rd16, %rd26;
	ld.global.f32 	%f105, [%rd27];
	add.ftz.f32 	%f106, %f5, %f105;
	st.global.f32 	[%rd27], %f106;

BB15_8:
	add.s32 	%r307, %r305, 1;
	mul.lo.s32 	%r114, %r307, %r305;
	shr.u32 	%r115, %r114, 31;
	add.s32 	%r116, %r114, %r115;
	shr.s32 	%r117, %r116, 1;
	add.s32 	%r118, %r117, %r305;
	mad.lo.s32 	%r119, %r118, %r6, %r5;
	mul.wide.s32 	%rd29, %r119, 4;
	add.s64 	%rd30, %rd16, %rd29;
	ld.global.f32 	%f107, [%rd30];
	add.ftz.f32 	%f108, %f5, %f107;
	st.global.f32 	[%rd30], %f108;

BB15_9:
	setp.lt.u32	%p11, %r8, 4;
	@%p11 bra 	BB15_11;

BB15_10:
	add.s32 	%r120, %r307, 1;
	mul.lo.s32 	%r121, %r120, %r307;
	shr.u32 	%r122, %r121, 31;
	add.s32 	%r123, %r121, %r122;
	shr.s32 	%r124, %r123, 1;
	add.s32 	%r125, %r307, %r124;
	mad.lo.s32 	%r126, %r125, %r6, %r5;
	mul.wide.s32 	%rd31, %r126, 4;
	add.s64 	%rd32, %rd16, %rd31;
	ld.global.f32 	%f109, [%rd32];
	add.ftz.f32 	%f110, %f5, %f109;
	st.global.f32 	[%rd32], %f110;
	add.s32 	%r127, %r307, 2;
	mul.lo.s32 	%r128, %r127, %r120;
	shr.u32 	%r129, %r128, 31;
	add.s32 	%r130, %r128, %r129;
	shr.s32 	%r131, %r130, 1;
	add.s32 	%r132, %r120, %r131;
	mad.lo.s32 	%r133, %r132, %r6, %r5;
	mul.wide.s32 	%rd33, %r133, 4;
	add.s64 	%rd34, %rd16, %rd33;
	ld.global.f32 	%f111, [%rd34];
	add.ftz.f32 	%f112, %f5, %f111;
	st.global.f32 	[%rd34], %f112;
	add.s32 	%r134, %r307, 3;
	mul.lo.s32 	%r135, %r134, %r127;
	shr.u32 	%r136, %r135, 31;
	add.s32 	%r137, %r135, %r136;
	shr.s32 	%r138, %r137, 1;
	add.s32 	%r139, %r127, %r138;
	mad.lo.s32 	%r140, %r139, %r6, %r5;
	mul.wide.s32 	%rd35, %r140, 4;
	add.s64 	%rd36, %rd16, %rd35;
	ld.global.f32 	%f113, [%rd36];
	add.ftz.f32 	%f114, %f5, %f113;
	st.global.f32 	[%rd36], %f114;
	add.s32 	%r307, %r307, 4;
	mul.lo.s32 	%r141, %r307, %r134;
	shr.u32 	%r142, %r141, 31;
	add.s32 	%r143, %r141, %r142;
	shr.s32 	%r144, %r143, 1;
	add.s32 	%r145, %r134, %r144;
	mad.lo.s32 	%r146, %r145, %r6, %r5;
	mul.wide.s32 	%rd37, %r146, 4;
	add.s64 	%rd38, %rd16, %rd37;
	ld.global.f32 	%f115, [%rd38];
	add.ftz.f32 	%f116, %f5, %f115;
	st.global.f32 	[%rd38], %f116;
	setp.lt.s32	%p12, %r307, %r101;
	@%p12 bra 	BB15_10;

BB15_11:
	@%p7 bra 	BB15_31;

	mov.u32 	%r308, 0;

BB15_13:
	setp.lt.s32	%p14, %r308, 0;
	@%p14 bra 	BB15_30;

	add.s32 	%r150, %r308, 1;
	mul.lo.s32 	%r151, %r150, %r308;
	shr.u32 	%r152, %r151, 31;
	add.s32 	%r153, %r151, %r152;
	shr.s32 	%r17, %r153, 1;
	mov.u32 	%r309, 0;

BB15_15:
	add.s32 	%r154, %r17, %r309;
	mad.lo.s32 	%r155, %r154, %r6, %r5;
	mul.wide.s32 	%rd40, %r155, 4;
	add.s64 	%rd5, %rd16, %rd40;
	ld.global.f32 	%f6, [%rd5];
	setp.lt.s32	%p15, %r309, 1;
	@%p15 bra 	BB15_16;

	add.s32 	%r157, %r309, 1;
	mul.lo.s32 	%r158, %r157, %r309;
	shr.u32 	%r159, %r158, 31;
	add.s32 	%r160, %r158, %r159;
	shr.s32 	%r19, %r160, 1;
	and.b32  	%r20, %r309, 3;
	setp.eq.s32	%p16, %r20, 0;
	mov.f32 	%f306, 0f00000000;
	mov.u32 	%r315, 0;
	@%p16 bra 	BB15_23;

	setp.eq.s32	%p17, %r20, 1;
	mov.u32 	%r311, 0;
	@%p17 bra 	BB15_22;

	setp.eq.s32	%p18, %r20, 2;
	mov.u32 	%r310, 0;
	@%p18 bra 	BB15_21;

	mad.lo.s32 	%r173, %r17, %r6, %r5;
	mul.wide.s32 	%rd42, %r173, 4;
	add.s64 	%rd43, %rd16, %rd42;
	mad.lo.s32 	%r174, %r19, %r6, %r5;
	mul.wide.s32 	%rd44, %r174, 4;
	add.s64 	%rd45, %rd16, %rd44;
	ld.global.f32 	%f118, [%rd45];
	ld.global.f32 	%f119, [%rd43];
	mul.ftz.f32 	%f120, %f119, %f118;
	sub.ftz.f32 	%f6, %f6, %f120;
	mov.u32 	%r310, 1;

BB15_21:
	add.s32 	%r175, %r310, %r17;
	mad.lo.s32 	%r176, %r175, %r6, %r5;
	mul.wide.s32 	%rd47, %r176, 4;
	add.s64 	%rd48, %rd16, %rd47;
	add.s32 	%r177, %r19, %r310;
	mad.lo.s32 	%r178, %r177, %r6, %r5;
	mul.wide.s32 	%rd49, %r178, 4;
	add.s64 	%rd50, %rd16, %rd49;
	ld.global.f32 	%f121, [%rd50];
	ld.global.f32 	%f122, [%rd48];
	mul.ftz.f32 	%f123, %f122, %f121;
	sub.ftz.f32 	%f6, %f6, %f123;
	add.s32 	%r311, %r310, 1;

BB15_22:
	add.s32 	%r179, %r311, %r17;
	mad.lo.s32 	%r180, %r179, %r6, %r5;
	mul.wide.s32 	%rd52, %r180, 4;
	add.s64 	%rd53, %rd16, %rd52;
	add.s32 	%r181, %r19, %r311;
	mad.lo.s32 	%r182, %r181, %r6, %r5;
	mul.wide.s32 	%rd54, %r182, 4;
	add.s64 	%rd55, %rd16, %rd54;
	ld.global.f32 	%f124, [%rd55];
	ld.global.f32 	%f125, [%rd53];
	mul.ftz.f32 	%f126, %f125, %f124;
	sub.ftz.f32 	%f6, %f6, %f126;
	add.s32 	%r315, %r311, 1;
	mov.f32 	%f306, %f6;

BB15_23:
	setp.lt.u32	%p19, %r309, 4;
	@%p19 bra 	BB15_26;

	add.s32 	%r183, %r315, %r19;
	mul.lo.s32 	%r314, %r6, %r183;
	add.s32 	%r185, %r17, %r315;
	mul.lo.s32 	%r313, %r6, %r185;
	shl.b32 	%r28, %r6, 2;
	mov.f32 	%f306, %f6;

BB15_25:
	add.s32 	%r186, %r313, %r5;
	mul.wide.s32 	%rd56, %r186, 4;
	add.s64 	%rd57, %rd16, %rd56;
	add.s32 	%r187, %r314, %r5;
	mul.wide.s32 	%rd58, %r187, 4;
	add.s64 	%rd59, %rd16, %rd58;
	ld.global.f32 	%f127, [%rd59];
	ld.global.f32 	%f128, [%rd57];
	mul.ftz.f32 	%f129, %f128, %f127;
	sub.ftz.f32 	%f130, %f306, %f129;
	cvt.s64.s32	%rd60, %r28;
	add.s64 	%rd61, %rd57, %rd60;
	add.s64 	%rd62, %rd59, %rd60;
	ld.global.f32 	%f131, [%rd62];
	ld.global.f32 	%f132, [%rd61];
	mul.ftz.f32 	%f133, %f132, %f131;
	sub.ftz.f32 	%f134, %f130, %f133;
	add.s64 	%rd63, %rd61, %rd60;
	add.s64 	%rd64, %rd62, %rd60;
	ld.global.f32 	%f135, [%rd64];
	ld.global.f32 	%f136, [%rd63];
	mul.ftz.f32 	%f137, %f136, %f135;
	sub.ftz.f32 	%f138, %f134, %f137;
	add.s64 	%rd65, %rd63, %rd60;
	add.s64 	%rd66, %rd64, %rd60;
	ld.global.f32 	%f139, [%rd66];
	ld.global.f32 	%f140, [%rd65];
	mul.ftz.f32 	%f141, %f140, %f139;
	sub.ftz.f32 	%f306, %f138, %f141;
	add.s32 	%r314, %r314, %r28;
	add.s32 	%r313, %r313, %r28;
	add.s32 	%r315, %r315, 4;
	setp.lt.s32	%p20, %r315, %r309;
	@%p20 bra 	BB15_25;
	bra.uni 	BB15_26;

BB15_16:
	mov.f32 	%f306, %f6;

BB15_26:
	setp.eq.s32	%p21, %r308, %r309;
	@%p21 bra 	BB15_28;
	bra.uni 	BB15_27;

BB15_28:
	mov.f32 	%f143, 0f00000000;
	max.ftz.f32 	%f144, %f306, %f143;
	sqrt.approx.ftz.f32 	%f307, %f144;
	bra.uni 	BB15_29;

BB15_27:
	add.s32 	%r188, %r309, 1;
	mul.lo.s32 	%r189, %r188, %r309;
	shr.u32 	%r190, %r189, 31;
	add.s32 	%r191, %r189, %r190;
	shr.s32 	%r192, %r191, 1;
	add.s32 	%r193, %r192, %r309;
	mad.lo.s32 	%r194, %r193, %r6, %r5;
	mul.wide.s32 	%rd68, %r194, 4;
	add.s64 	%rd69, %rd16, %rd68;
	ld.global.f32 	%f142, [%rd69];
	div.approx.ftz.f32 	%f307, %f306, %f142;

BB15_29:
	st.global.f32 	[%rd5], %f307;
	add.s32 	%r36, %r309, 1;
	setp.lt.s32	%p22, %r309, %r308;
	mov.u32 	%r309, %r36;
	@%p22 bra 	BB15_15;

BB15_30:
	add.s32 	%r195, %r7, 1;
	add.s32 	%r308, %r308, 1;
	setp.lt.s32	%p23, %r308, %r195;
	@%p23 bra 	BB15_13;

BB15_31:
	rcp.approx.ftz.f32 	%f145, %f1;
	mul.ftz.f32 	%f20, %f145, %f2;
	mul.ftz.f32 	%f21, %f145, %f3;
	mul.ftz.f32 	%f22, %f145, %f4;
	@%p7 bra 	BB15_46;

	mov.u32 	%r316, 0;

BB15_33:
	mov.u32 	%r38, %r316;
	mad.lo.s32 	%r197, %r38, %r6, %r5;
	mul.wide.s32 	%rd71, %r197, 12;
	add.s64 	%rd7, %rd1, %rd71;
	ld.global.f32 	%f23, [%rd7];
	ld.global.f32 	%f24, [%rd7+4];
	ld.global.f32 	%f25, [%rd7+8];
	add.s32 	%r316, %r38, 1;
	mul.lo.s32 	%r199, %r316, %r38;
	shr.u32 	%r200, %r199, 31;
	add.s32 	%r201, %r199, %r200;
	shr.s32 	%r39, %r201, 1;
	setp.lt.s32	%p25, %r38, 1;
	@%p25 bra 	BB15_34;

	and.b32  	%r40, %r38, 3;
	setp.eq.s32	%p26, %r40, 0;
	mov.f32 	%f323, 0f00000000;
	mov.u32 	%r322, 0;
	@%p26 bra 	BB15_36;
	bra.uni 	BB15_37;

BB15_36:
	mov.f32 	%f324, %f323;
	mov.f32 	%f325, %f323;
	bra.uni 	BB15_42;

BB15_34:
	mov.f32 	%f323, %f25;
	mov.f32 	%f324, %f24;
	mov.f32 	%f325, %f23;
	bra.uni 	BB15_45;

BB15_37:
	setp.eq.s32	%p27, %r40, 1;
	mov.u32 	%r318, 0;
	@%p27 bra 	BB15_41;

	setp.eq.s32	%p28, %r40, 2;
	mov.u32 	%r317, 0;
	@%p28 bra 	BB15_40;

	mad.lo.s32 	%r206, %r39, %r6, %r5;
	mul.wide.s32 	%rd73, %r206, 4;
	add.s64 	%rd74, %rd16, %rd73;
	ld.global.f32 	%f149, [%rd3];
	ld.global.f32 	%f150, [%rd74];
	mul.ftz.f32 	%f151, %f150, %f149;
	ld.global.f32 	%f152, [%rd3+4];
	mul.ftz.f32 	%f153, %f150, %f152;
	ld.global.f32 	%f154, [%rd3+8];
	mul.ftz.f32 	%f155, %f150, %f154;
	sub.ftz.f32 	%f23, %f23, %f151;
	sub.ftz.f32 	%f24, %f24, %f153;
	sub.ftz.f32 	%f25, %f25, %f155;
	mov.u32 	%r317, 1;

BB15_40:
	add.s32 	%r207, %r39, %r317;
	mad.lo.s32 	%r208, %r207, %r6, %r5;
	mul.wide.s32 	%rd76, %r208, 4;
	add.s64 	%rd77, %rd16, %rd76;
	neg.s32 	%r209, %r317;
	and.b32  	%r210, %r6, %r209;
	add.s32 	%r211, %r210, %r5;
	mul.wide.s32 	%rd79, %r211, 12;
	add.s64 	%rd80, %rd1, %rd79;
	ld.global.f32 	%f156, [%rd80];
	ld.global.f32 	%f157, [%rd77];
	mul.ftz.f32 	%f158, %f157, %f156;
	ld.global.f32 	%f159, [%rd80+4];
	mul.ftz.f32 	%f160, %f157, %f159;
	ld.global.f32 	%f161, [%rd80+8];
	mul.ftz.f32 	%f162, %f157, %f161;
	sub.ftz.f32 	%f23, %f23, %f158;
	sub.ftz.f32 	%f24, %f24, %f160;
	sub.ftz.f32 	%f25, %f25, %f162;
	add.s32 	%r318, %r317, 1;

BB15_41:
	add.s32 	%r212, %r39, %r318;
	mad.lo.s32 	%r213, %r212, %r6, %r5;
	mul.wide.s32 	%rd82, %r213, 4;
	add.s64 	%rd83, %rd16, %rd82;
	mad.lo.s32 	%r214, %r318, %r6, %r5;
	mul.wide.s32 	%rd85, %r214, 12;
	add.s64 	%rd86, %rd1, %rd85;
	ld.global.f32 	%f163, [%rd86];
	ld.global.f32 	%f164, [%rd83];
	mul.ftz.f32 	%f165, %f164, %f163;
	ld.global.f32 	%f166, [%rd86+4];
	mul.ftz.f32 	%f167, %f164, %f166;
	ld.global.f32 	%f168, [%rd86+8];
	mul.ftz.f32 	%f169, %f164, %f168;
	sub.ftz.f32 	%f23, %f23, %f165;
	sub.ftz.f32 	%f24, %f24, %f167;
	sub.ftz.f32 	%f25, %f25, %f169;
	add.s32 	%r322, %r318, 1;
	mov.f32 	%f323, %f25;
	mov.f32 	%f324, %f24;
	mov.f32 	%f325, %f23;

BB15_42:
	setp.lt.u32	%p29, %r38, 4;
	@%p29 bra 	BB15_45;

	add.s32 	%r215, %r322, %r39;
	mul.lo.s32 	%r321, %r6, %r215;
	mul.lo.s32 	%r320, %r6, %r322;
	mov.f32 	%f323, %f25;
	mov.f32 	%f324, %f24;
	mov.f32 	%f325, %f23;

BB15_44:
	add.s32 	%r217, %r321, %r5;
	mul.wide.s32 	%rd88, %r217, 4;
	add.s64 	%rd89, %rd16, %rd88;
	add.s32 	%r218, %r320, %r5;
	mul.wide.s32 	%rd91, %r218, 12;
	add.s64 	%rd92, %rd1, %rd91;
	ld.global.f32 	%f170, [%rd92];
	ld.global.f32 	%f171, [%rd89];
	mul.ftz.f32 	%f172, %f171, %f170;
	ld.global.f32 	%f173, [%rd92+4];
	mul.ftz.f32 	%f174, %f171, %f173;
	ld.global.f32 	%f175, [%rd92+8];
	mul.ftz.f32 	%f176, %f171, %f175;
	sub.ftz.f32 	%f177, %f325, %f172;
	sub.ftz.f32 	%f178, %f324, %f174;
	sub.ftz.f32 	%f179, %f323, %f176;
	shl.b32 	%r220, %r6, 2;
	cvt.s64.s32	%rd93, %r220;
	add.s64 	%rd94, %rd89, %rd93;
	mul.lo.s32 	%r221, %r6, 12;
	cvt.s64.s32	%rd95, %r221;
	add.s64 	%rd96, %rd92, %rd95;
	ld.global.f32 	%f180, [%rd96];
	ld.global.f32 	%f181, [%rd94];
	mul.ftz.f32 	%f182, %f181, %f180;
	ld.global.f32 	%f183, [%rd96+4];
	mul.ftz.f32 	%f184, %f181, %f183;
	ld.global.f32 	%f185, [%rd96+8];
	mul.ftz.f32 	%f186, %f181, %f185;
	sub.ftz.f32 	%f187, %f177, %f182;
	sub.ftz.f32 	%f188, %f178, %f184;
	sub.ftz.f32 	%f189, %f179, %f186;
	add.s64 	%rd97, %rd94, %rd93;
	add.s64 	%rd98, %rd96, %rd95;
	ld.global.f32 	%f190, [%rd98];
	ld.global.f32 	%f191, [%rd97];
	mul.ftz.f32 	%f192, %f191, %f190;
	ld.global.f32 	%f193, [%rd98+4];
	mul.ftz.f32 	%f194, %f191, %f193;
	ld.global.f32 	%f195, [%rd98+8];
	mul.ftz.f32 	%f196, %f191, %f195;
	sub.ftz.f32 	%f197, %f187, %f192;
	sub.ftz.f32 	%f198, %f188, %f194;
	sub.ftz.f32 	%f199, %f189, %f196;
	add.s64 	%rd99, %rd97, %rd93;
	add.s64 	%rd100, %rd98, %rd95;
	ld.global.f32 	%f200, [%rd100];
	ld.global.f32 	%f201, [%rd99];
	mul.ftz.f32 	%f202, %f201, %f200;
	ld.global.f32 	%f203, [%rd100+4];
	mul.ftz.f32 	%f204, %f201, %f203;
	ld.global.f32 	%f205, [%rd100+8];
	mul.ftz.f32 	%f206, %f201, %f205;
	sub.ftz.f32 	%f325, %f197, %f202;
	sub.ftz.f32 	%f324, %f198, %f204;
	sub.ftz.f32 	%f323, %f199, %f206;
	add.s32 	%r321, %r321, %r220;
	add.s32 	%r320, %r320, %r220;
	add.s32 	%r322, %r322, 4;
	setp.lt.s32	%p30, %r322, %r38;
	@%p30 bra 	BB15_44;

BB15_45:
	add.s32 	%r222, %r39, %r38;
	mad.lo.s32 	%r223, %r222, %r6, %r5;
	mul.wide.s32 	%rd102, %r223, 4;
	add.s64 	%rd103, %rd16, %rd102;
	ld.global.f32 	%f207, [%rd103];
	rcp.approx.ftz.f32 	%f208, %f207;
	mul.ftz.f32 	%f209, %f325, %f208;
	mul.ftz.f32 	%f210, %f324, %f208;
	mul.ftz.f32 	%f211, %f323, %f208;
	st.global.f32 	[%rd7], %f209;
	st.global.f32 	[%rd7+4], %f210;
	st.global.f32 	[%rd7+8], %f211;
	add.s32 	%r224, %r7, 1;
	setp.lt.s32	%p31, %r316, %r224;
	@%p31 bra 	BB15_33;

BB15_46:
	@%p7 bra 	BB15_61;

	shl.b32 	%r55, %r6, 2;
	mul.lo.s32 	%r57, %r6, 12;
	mov.u32 	%r323, 0;
	mov.u32 	%r324, %r7;

BB15_48:
	mad.lo.s32 	%r226, %r324, %r6, %r5;
	mul.wide.s32 	%rd104, %r226, 12;
	add.s64 	%rd10, %rd1, %rd104;
	ld.global.f32 	%f56, [%rd10];
	ld.global.f32 	%f57, [%rd10+4];
	ld.global.f32 	%f58, [%rd10+8];
	add.s32 	%r227, %r324, 1;
	add.s32 	%r228, %r7, 1;
	setp.ge.s32	%p33, %r227, %r228;
	@%p33 bra 	BB15_49;

	and.b32  	%r60, %r323, 3;
	setp.eq.s32	%p34, %r60, 0;
	mov.f32 	%f341, 0f00000000;
	@%p34 bra 	BB15_51;
	bra.uni 	BB15_52;

BB15_51:
	mov.u32 	%r329, %r227;
	mov.f32 	%f342, %f341;
	mov.f32 	%f343, %f341;
	bra.uni 	BB15_57;

BB15_49:
	mov.f32 	%f341, %f58;
	mov.f32 	%f342, %f57;
	mov.f32 	%f343, %f56;
	bra.uni 	BB15_60;

BB15_52:
	setp.eq.s32	%p35, %r60, 1;
	add.s32 	%r326, %r324, 1;
	@%p35 bra 	BB15_56;

	setp.eq.s32	%p36, %r60, 2;
	add.s32 	%r325, %r324, 1;
	@%p36 bra 	BB15_55;

	add.s32 	%r229, %r324, 1;
	add.s32 	%r325, %r324, 2;
	mul.lo.s32 	%r230, %r325, %r229;
	shr.u32 	%r231, %r230, 31;
	add.s32 	%r232, %r230, %r231;
	shr.s32 	%r233, %r232, 1;
	add.s32 	%r234, %r233, %r324;
	mad.lo.s32 	%r235, %r234, %r6, %r5;
	mul.wide.s32 	%rd106, %r235, 4;
	add.s64 	%rd107, %rd16, %rd106;
	mad.lo.s32 	%r236, %r229, %r6, %r5;
	mul.wide.s32 	%rd109, %r236, 12;
	add.s64 	%rd110, %rd1, %rd109;
	ld.global.f32 	%f215, [%rd110];
	ld.global.f32 	%f216, [%rd107];
	mul.ftz.f32 	%f217, %f216, %f215;
	ld.global.f32 	%f218, [%rd110+4];
	mul.ftz.f32 	%f219, %f216, %f218;
	ld.global.f32 	%f220, [%rd110+8];
	mul.ftz.f32 	%f221, %f216, %f220;
	sub.ftz.f32 	%f56, %f56, %f217;
	sub.ftz.f32 	%f57, %f57, %f219;
	sub.ftz.f32 	%f58, %f58, %f221;

BB15_55:
	add.s32 	%r326, %r325, 1;
	mul.lo.s32 	%r237, %r326, %r325;
	shr.u32 	%r238, %r237, 31;
	add.s32 	%r239, %r237, %r238;
	shr.s32 	%r240, %r239, 1;
	add.s32 	%r241, %r240, %r324;
	mad.lo.s32 	%r242, %r241, %r6, %r5;
	mul.wide.s32 	%rd112, %r242, 4;
	add.s64 	%rd113, %rd16, %rd112;
	mad.lo.s32 	%r243, %r325, %r6, %r5;
	mul.wide.s32 	%rd115, %r243, 12;
	add.s64 	%rd116, %rd1, %rd115;
	ld.global.f32 	%f222, [%rd116];
	ld.global.f32 	%f223, [%rd113];
	mul.ftz.f32 	%f224, %f223, %f222;
	ld.global.f32 	%f225, [%rd116+4];
	mul.ftz.f32 	%f226, %f223, %f225;
	ld.global.f32 	%f227, [%rd116+8];
	mul.ftz.f32 	%f228, %f223, %f227;
	sub.ftz.f32 	%f56, %f56, %f224;
	sub.ftz.f32 	%f57, %f57, %f226;
	sub.ftz.f32 	%f58, %f58, %f228;

BB15_56:
	add.s32 	%r329, %r326, 1;
	mul.lo.s32 	%r244, %r329, %r326;
	shr.u32 	%r245, %r244, 31;
	add.s32 	%r246, %r244, %r245;
	shr.s32 	%r247, %r246, 1;
	add.s32 	%r248, %r247, %r324;
	mad.lo.s32 	%r249, %r248, %r6, %r5;
	mul.wide.s32 	%rd118, %r249, 4;
	add.s64 	%rd119, %rd16, %rd118;
	mad.lo.s32 	%r250, %r326, %r6, %r5;
	mul.wide.s32 	%rd121, %r250, 12;
	add.s64 	%rd122, %rd1, %rd121;
	ld.global.f32 	%f229, [%rd122];
	ld.global.f32 	%f230, [%rd119];
	mul.ftz.f32 	%f231, %f230, %f229;
	ld.global.f32 	%f232, [%rd122+4];
	mul.ftz.f32 	%f233, %f230, %f232;
	ld.global.f32 	%f234, [%rd122+8];
	mul.ftz.f32 	%f235, %f230, %f234;
	sub.ftz.f32 	%f56, %f56, %f231;
	sub.ftz.f32 	%f57, %f57, %f233;
	sub.ftz.f32 	%f58, %f58, %f235;
	mov.f32 	%f341, %f58;
	mov.f32 	%f342, %f57;
	mov.f32 	%f343, %f56;

BB15_57:
	setp.lt.u32	%p37, %r323, 4;
	@%p37 bra 	BB15_60;

	mul.lo.s32 	%r328, %r6, %r329;
	mov.f32 	%f341, %f58;
	mov.f32 	%f342, %f57;
	mov.f32 	%f343, %f56;

BB15_59:
	add.s32 	%r251, %r329, 1;
	mul.lo.s32 	%r252, %r251, %r329;
	shr.u32 	%r253, %r252, 31;
	add.s32 	%r254, %r252, %r253;
	shr.s32 	%r255, %r254, 1;
	add.s32 	%r256, %r255, %r324;
	mad.lo.s32 	%r257, %r256, %r6, %r5;
	mul.wide.s32 	%rd123, %r257, 4;
	add.s64 	%rd124, %rd16, %rd123;
	add.s32 	%r258, %r328, %r5;
	mul.wide.s32 	%rd125, %r258, 12;
	add.s64 	%rd126, %rd1, %rd125;
	ld.global.f32 	%f236, [%rd126];
	ld.global.f32 	%f237, [%rd124];
	mul.ftz.f32 	%f238, %f237, %f236;
	ld.global.f32 	%f239, [%rd126+4];
	mul.ftz.f32 	%f240, %f237, %f239;
	ld.global.f32 	%f241, [%rd126+8];
	mul.ftz.f32 	%f242, %f237, %f241;
	sub.ftz.f32 	%f243, %f343, %f238;
	sub.ftz.f32 	%f244, %f342, %f240;
	sub.ftz.f32 	%f245, %f341, %f242;
	add.s32 	%r259, %r329, 2;
	mul.lo.s32 	%r260, %r259, %r251;
	shr.u32 	%r261, %r260, 31;
	add.s32 	%r262, %r260, %r261;
	shr.s32 	%r263, %r262, 1;
	add.s32 	%r264, %r263, %r324;
	mad.lo.s32 	%r265, %r264, %r6, %r5;
	mul.wide.s32 	%rd127, %r265, 4;
	add.s64 	%rd128, %rd16, %rd127;
	cvt.s64.s32	%rd129, %r57;
	add.s64 	%rd130, %rd126, %rd129;
	ld.global.f32 	%f246, [%rd130];
	ld.global.f32 	%f247, [%rd128];
	mul.ftz.f32 	%f248, %f247, %f246;
	ld.global.f32 	%f249, [%rd130+4];
	mul.ftz.f32 	%f250, %f247, %f249;
	ld.global.f32 	%f251, [%rd130+8];
	mul.ftz.f32 	%f252, %f247, %f251;
	sub.ftz.f32 	%f253, %f243, %f248;
	sub.ftz.f32 	%f254, %f244, %f250;
	sub.ftz.f32 	%f255, %f245, %f252;
	add.s32 	%r266, %r329, 3;
	mul.lo.s32 	%r267, %r266, %r259;
	shr.u32 	%r268, %r267, 31;
	add.s32 	%r269, %r267, %r268;
	shr.s32 	%r270, %r269, 1;
	add.s32 	%r271, %r270, %r324;
	mad.lo.s32 	%r272, %r271, %r6, %r5;
	mul.wide.s32 	%rd131, %r272, 4;
	add.s64 	%rd132, %rd16, %rd131;
	add.s64 	%rd133, %rd130, %rd129;
	ld.global.f32 	%f256, [%rd133];
	ld.global.f32 	%f257, [%rd132];
	mul.ftz.f32 	%f258, %f257, %f256;
	ld.global.f32 	%f259, [%rd133+4];
	mul.ftz.f32 	%f260, %f257, %f259;
	ld.global.f32 	%f261, [%rd133+8];
	mul.ftz.f32 	%f262, %f257, %f261;
	sub.ftz.f32 	%f263, %f253, %f258;
	sub.ftz.f32 	%f264, %f254, %f260;
	sub.ftz.f32 	%f265, %f255, %f262;
	add.s32 	%r329, %r329, 4;
	mul.lo.s32 	%r273, %r329, %r266;
	shr.u32 	%r274, %r273, 31;
	add.s32 	%r275, %r273, %r274;
	shr.s32 	%r276, %r275, 1;
	add.s32 	%r277, %r276, %r324;
	mad.lo.s32 	%r278, %r277, %r6, %r5;
	mul.wide.s32 	%rd134, %r278, 4;
	add.s64 	%rd135, %rd16, %rd134;
	add.s64 	%rd136, %rd133, %rd129;
	ld.global.f32 	%f266, [%rd136];
	ld.global.f32 	%f267, [%rd135];
	mul.ftz.f32 	%f268, %f267, %f266;
	ld.global.f32 	%f269, [%rd136+4];
	mul.ftz.f32 	%f270, %f267, %f269;
	ld.global.f32 	%f271, [%rd136+8];
	mul.ftz.f32 	%f272, %f267, %f271;
	sub.ftz.f32 	%f343, %f263, %f268;
	sub.ftz.f32 	%f342, %f264, %f270;
	sub.ftz.f32 	%f341, %f265, %f272;
	add.s32 	%r328, %r328, %r55;
	setp.lt.s32	%p38, %r329, %r228;
	@%p38 bra 	BB15_59;

BB15_60:
	mul.lo.s32 	%r281, %r324, %r227;
	shr.u32 	%r282, %r281, 31;
	add.s32 	%r283, %r281, %r282;
	shr.s32 	%r284, %r283, 1;
	add.s32 	%r285, %r284, %r324;
	mad.lo.s32 	%r286, %r285, %r6, %r5;
	mul.wide.s32 	%rd137, %r286, 4;
	add.s64 	%rd138, %rd16, %rd137;
	ld.global.f32 	%f273, [%rd138];
	rcp.approx.ftz.f32 	%f274, %f273;
	mul.ftz.f32 	%f275, %f343, %f274;
	mul.ftz.f32 	%f276, %f342, %f274;
	mul.ftz.f32 	%f277, %f341, %f274;
	st.global.f32 	[%rd10], %f275;
	st.global.f32 	[%rd10+4], %f276;
	st.global.f32 	[%rd10+8], %f277;
	add.s32 	%r324, %r324, -1;
	setp.gt.s32	%p39, %r324, -1;
	add.s32 	%r323, %r323, 1;
	@%p39 bra 	BB15_48;

BB15_61:
	ld.global.f32 	%f278, [%rd3];
	ld.global.f32 	%f90, [%rd3+4];
	ld.global.f32 	%f91, [%rd3+8];
	// inline asm
	mov.b32 	%r287, %f278;
	// inline asm
	mov.pred 	%p73, -1;
	setp.nan.ftz.f32	%p41, %f278, %f278;
	@%p41 bra 	BB15_74;

	and.b32  	%r288, %r287, 2147483647;
	setp.eq.s32	%p42, %r288, 0;
	shl.b32 	%r78, %r287, 1;
	@%p42 bra 	BB15_64;

	setp.lt.u32	%p44, %r78, -16777215;
	add.ftz.f32 	%f279, %f278, %f278;
	setp.neu.ftz.f32	%p45, %f278, %f279;
	and.pred  	%p46, %p45, %p44;
	@%p46 bra 	BB15_65;
	bra.uni 	BB15_74;

BB15_64:
	setp.gt.u32	%p48, %r78, -16777216;
	@%p48 bra 	BB15_74;

BB15_65:
	// inline asm
	mov.b32 	%r289, %f90;
	// inline asm
	setp.nan.ftz.f32	%p50, %f90, %f90;
	@%p50 bra 	BB15_74;

	and.b32  	%r290, %r289, 2147483647;
	setp.eq.s32	%p51, %r290, 0;
	shl.b32 	%r80, %r289, 1;
	@%p51 bra 	BB15_68;

	setp.lt.u32	%p53, %r80, -16777215;
	add.ftz.f32 	%f281, %f90, %f90;
	setp.neu.ftz.f32	%p54, %f90, %f281;
	and.pred  	%p55, %p54, %p53;
	@%p55 bra 	BB15_69;
	bra.uni 	BB15_74;

BB15_68:
	setp.gt.u32	%p57, %r80, -16777216;
	@%p57 bra 	BB15_74;

BB15_69:
	// inline asm
	mov.b32 	%r291, %f91;
	// inline asm
	setp.nan.ftz.f32	%p59, %f91, %f91;
	@%p59 bra 	BB15_74;

	and.b32  	%r292, %r291, 2147483647;
	setp.eq.s32	%p60, %r292, 0;
	shl.b32 	%r82, %r291, 1;
	@%p60 bra 	BB15_72;

	setp.gt.u32	%p62, %r82, -16777216;
	add.ftz.f32 	%f283, %f91, %f91;
	setp.eq.ftz.f32	%p63, %f91, %f283;
	or.pred  	%p64, %p63, %p62;
	@%p64 bra 	BB15_74;
	bra.uni 	BB15_73;

BB15_72:
	setp.gt.u32	%p66, %r82, -16777216;
	@%p66 bra 	BB15_74;

BB15_73:
	setp.lt.ftz.f32	%p67, %f278, 0fBC23D70A;
	setp.lt.ftz.f32	%p68, %f90, 0fBC23D70A;
	or.pred  	%p69, %p67, %p68;
	setp.lt.ftz.f32	%p70, %f91, 0fBC23D70A;
	or.pred  	%p73, %p69, %p70;

BB15_74:
	selp.f32	%f284, %f20, %f278, %p73;
	mov.f32 	%f285, 0f00000000;
	max.ftz.f32 	%f286, %f284, %f285;
	selp.f32	%f287, %f21, %f90, %p73;
	max.ftz.f32 	%f288, %f287, %f285;
	selp.f32	%f289, %f22, %f91, %p73;
	max.ftz.f32 	%f290, %f289, %f285;
	mul.ftz.f32 	%f291, %f286, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f292, %f291;
	mul.ftz.f32 	%f293, %f288, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f294, %f293;
	mul.ftz.f32 	%f295, %f290, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f296, %f295;
	add.ftz.f32 	%f346, %f292, 0fBF800000;
	add.ftz.f32 	%f345, %f294, 0fBF800000;
	add.ftz.f32 	%f344, %f296, 0fBF800000;
	add.s32 	%r301, %r1, %r90;
	mad.lo.s32 	%r302, %r2, %r91, %r301;
	mul.lo.s32 	%r84, %r302, %r92;
	cvta.to.global.u64 	%rd11, %rd12;
	setp.lt.s32	%p71, %r93, 0;
	@%p71 bra 	BB15_77;

	cvt.rn.f32.s32	%f297, %r89;
	mul.ftz.f32 	%f346, %f297, %f346;
	mul.ftz.f32 	%f345, %f297, %f345;
	mul.ftz.f32 	%f344, %f297, %f344;
	setp.lt.s32	%p72, %r93, 1;
	@%p72 bra 	BB15_77;

	add.s32 	%r303, %r84, %r93;
	mul.wide.s32 	%rd140, %r303, 4;
	add.s64 	%rd141, %rd11, %rd140;
	ld.global.f32 	%f298, [%rd141];
	add.ftz.f32 	%f346, %f346, %f298;
	ld.global.f32 	%f299, [%rd141+4];
	add.ftz.f32 	%f345, %f345, %f299;
	ld.global.f32 	%f300, [%rd141+8];
	add.ftz.f32 	%f344, %f344, %f300;

BB15_77:
	mul.wide.s32 	%rd142, %r84, 4;
	add.s64 	%rd143, %rd11, %rd142;
	st.global.f32 	[%rd143], %f346;
	st.global.f32 	[%rd143+4], %f345;
	st.global.f32 	[%rd143+8], %f344;

BB15_78:
	ret;
}


